diff --git a/hasjob/tagging.py b/hasjob/tagging.py
index c2a5b156..ed174916 100644
--- a/hasjob/tagging.py
+++ b/hasjob/tagging.py
@@ -1,9 +1,10 @@
 from collections import defaultdict
+from collections.abc import Iterable
 from urllib.parse import urljoin
 
+import nltk
 import requests
 
-from coaster.nlp import extract_named_entities
 from coaster.utils import text_blocks
 
 from . import app, rq
@@ -25,6 +26,46 @@
 
 
+def extract_named_entities(text_blocks: Iterable[str]) -> set[str]:
+    """
+    Return the set of named entities found in the given blocks of text.
+
+    Each block is split into sentences, tokenized and part-of-speech tagged,
+    then chunked with NLTK's named entity chunker in binary mode, where every
+    entity is labelled ``NE`` without being classified further.
+
+    :param text_blocks: Iterable of text strings to scan
+    :return: Set of entity names, each made of the entity's tokens joined by spaces
+    """
+    sentences = []
+    for text in text_blocks:
+        sentences.extend(nltk.sent_tokenize(text))
+
+    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
+    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
+    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
+
+    def extract_entity_names(tree: nltk.Tree) -> list[str]:
+        """Recursively collect the text of every ``NE`` subtree under `tree`."""
+        entity_names = []
+
+        # Only tree nodes have a label; anything else (a tagged token) is skipped
+        if hasattr(tree, "label"):
+            if tree.label() == "NE":
+                entity_names.append(" ".join(child[0] for child in tree))
+            else:
+                for child in tree:
+                    entity_names.extend(extract_entity_names(child))
+
+        return entity_names
+
+    entity_names = []
+    for tree in chunked_sentences:
+        entity_names.extend(extract_entity_names(tree))
+
+    return set(entity_names)
+
+
 @rq.job('hasjob')
 def tag_locations(jobpost_id):
     with app.test_request_context():
         post = JobPost.query.get(jobpost_id)
diff --git a/pyproject.toml b/pyproject.toml
index 12c2a135..3bc2c7ee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -86,64 +86,6 @@ exclude_dirs = ['node_modules', 'build/lib']
 skips = ['*/*_test.py', '*/test_*.py']
 
 [tool.ruff]
-# This is a slight customisation of the default rules
-# 1. Hasjob still supports Python 3.7 pending its EOL
-# 2. Rule E402 (module-level import not top-level) is disabled as isort handles it
-# 3. Rule E501 (line too long) is left to Black; some strings are worse for wrapping
-
-# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
-select = ["E", "F"]
-ignore = ["E402", "E501"]
-
-# Allow autofix for all enabled rules (when `--fix`) is provided.
-fixable = [
-  "A",
-  "B",
-  "C",
-  "D",
-  "E",
-  "F",
-  "G",
-  "I",
-  "N",
-  "Q",
-  "S",
-  "T",
-  "W",
-  "ANN",
-  "ARG",
-  "BLE",
-  "COM",
-  "DJ",
-  "DTZ",
-  "EM",
-  "ERA",
-  "EXE",
-  "FBT",
-  "ICN",
-  "INP",
-  "ISC",
-  "NPY",
-  "PD",
-  "PGH",
-  "PIE",
-  "PL",
-  "PT",
-  "PTH",
-  "PYI",
-  "RET",
-  "RSE",
-  "RUF",
-  "SIM",
-  "SLF",
-  "TCH",
-  "TID",
-  "TRY",
-  "UP",
-  "YTT",
-]
-unfixable = []
-
 # Exclude a variety of commonly ignored directories.
 exclude = [
   ".bzr",
@@ -171,23 +113,48 @@ exclude = [
 # Same as Black.
 line-length = 88
 
+# Target Python 3.11
+target-version = "py311"
+
+[tool.ruff.format]
+docstring-code-format = true
+quote-style = "preserve"
+
+[tool.ruff.lint]
 # Allow unused variables when underscore-prefixed.
 dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 
-# Target Python 3.11
-target-version = "py311"
+# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
+select = ["E", "F"]
+ignore = ["E402", "E501"]
 
-[tool.ruff.mccabe]
+# Allow autofix for all enabled rules (when `--fix` is provided).
+fixable = ["ALL"]
+unfixable = []
+
+# Allow these characters in strings
+allowed-confusables = ["‘", "’"]
+
+[tool.ruff.lint.mccabe]
 # Unlike Flake8, default to a complexity level of 10.
 max-complexity = 10
 
-[tool.ruff.isort]
+[tool.ruff.lint.extend-per-file-ignores]
+"__init__.py" = ["E402"] # Allow non-top-level imports
+"tests/**.py" = [
+  "S101", # Allow assert
+  "ANN001", # Args don't need types (usually fixtures)
+  "N802", # Fixture returning a class may be named per class name convention
+  "N803", # Args don't require naming convention (fixture could be a class)
+]
+
+[tool.ruff.lint.isort]
 # These config options should match isort config above under [tool.isort]
 combine-as-imports = true
 extra-standard-library = ['typing_extensions']
 split-on-trailing-comma = false
 relative-imports-order = 'furthest-to-closest'
-known-first-party = ['coaster']
+known-first-party = ['coaster', 'baseframe', 'flask_lastuser']
 section-order = [
   'future',
   'standard-library',
@@ -197,5 +164,12 @@ section-order = [
   'local-folder',
 ]
 
-[tool.ruff.isort.sections]
+[tool.ruff.lint.isort.sections]
 repo = ['hasjob']
+
+[tool.ruff.lint.flake8-pytest-style]
+fixture-parentheses = false
+mark-parentheses = false
+
+[tool.ruff.lint.pyupgrade]
+keep-runtime-typing = true
diff --git a/requirements.txt b/requirements.txt
index 8eee7d60..1bf9cc8b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,6 @@ Flask-Migrate
 Flask-Redis
 Flask-RQ2
 Flask-SQLAlchemy
-Flask-Testing
 git+https://github.com/maxcountryman/flask-uploads.git#egg=Flask-Uploads
 Flask-WTF
 geoip2
@@ -19,6 +18,7 @@ gunicorn
 html2text
 jsmin
 langid
+nltk
 Pillow
 premailer
 progressbar2