From f8187d50af97c1cd33cd4b1ef6ad9b5ccc6c2f08 Mon Sep 17 00:00:00 2001
From: Henning Holgersen
Date: Fri, 22 Dec 2023 11:21:53 +0100
Subject: [PATCH 1/4] added working sync test

---
 .gitignore                        |   2 +
 pyproject.toml                    |   2 +
 tests/data_responses/13861.json   | 212 ++++++++++++++++++++++++++++++
 tests/schema_responses/13861.json | 110 ++++++++++++++++
 tests/test_core.py                |  22 ----
 tests/test_sync.py                |  82 ++++++++++++
 6 files changed, 408 insertions(+), 22 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 tests/data_responses/13861.json
 create mode 100644 tests/schema_responses/13861.json
 delete mode 100644 tests/test_core.py
 create mode 100644 tests/test_sync.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7ee7f5b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
index bc14e46..8bd1678 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,8 @@ requests = "~=2.31.0"
 [tool.poetry.group.dev.dependencies]
 pytest = ">=7.4.0"
 singer-sdk = { version="~=0.32.0", extras = ["testing"] }
+responses = ">0.1.0"
+pytest-cov = ">=3.0.0"

 [tool.poetry.extras]
 s3 = ["fs-s3fs"]
diff --git a/tests/data_responses/13861.json b/tests/data_responses/13861.json
new file mode 100644
index 0000000..5c133f3
--- /dev/null
+++ b/tests/data_responses/13861.json
@@ -0,0 +1,212 @@
+{
+  "version": "2.0",
+  "class": "dataset",
+  "label": "13861: Lønnsfordelingen belyst ved ulikhetsmålene Gini-koeffisient og P90/P10, etter næring (SN2007), statistikkvariabel og år",
+  "source": "Statistisk sentralbyrå",
+  "updated": "2023-02-02T07:00:00Z",
+  "note": [
+    "Vi kan ikke oppgi tall for næringene «T Lønnet arbeid i private husholdninger», «U Internasjonale organisasjoner og organer» og «Z Ufordelt næring» grunnet få observasjoner."
+  ],
+  "role": {
+    "time": [
+      "Tid"
+    ],
+    "metric": [
+      "ContentsCode"
+    ]
+  },
+  "id": [
+    "NACE2007",
+    "ContentsCode",
+    "Tid"
+  ],
+  "size": [
+    4,
+    2,
+    7
+  ],
+  "dimension": {
+    "NACE2007": {
+      "label": "næring (SN2007)",
+      "category": {
+        "index": {
+          "A-Z": 0,
+          "A": 1,
+          "B": 2,
+          "C": 3
+        },
+        "label": {
+          "A-Z": "Alle næringer",
+          "A": "Jordbruk, skogbruk og fiske",
+          "B": "Bergverksdrift og utvinning",
+          "C": "Industri"
+        }
+      },
+      "extension": {
+        "elimination": true,
+        "eliminationValueCode": "A-Z",
+        "show": "code_value"
+      },
+      "link": {
+        "describedby": [
+          {
+            "extension": {
+              "NACE2007": "urn:ssb:classification:klass:6"
+            }
+          }
+        ]
+      }
+    },
+    "ContentsCode": {
+      "label": "statistikkvariabel",
+      "category": {
+        "index": {
+          "Ginikoeffisient": 0,
+          "P90P10": 1
+        },
+        "label": {
+          "Ginikoeffisient": "Ginikoeffisient",
+          "P90P10": "P90/P10"
+        },
+        "note": {
+          "Ginikoeffisient": [
+            "Gini-koeffisienten er et summarisk mål som varierer fra 0 (minst ulikhet) og 1 (størst ulikhet)."
+          ],
+          "P90P10": [
+            "P90/P10 viser til forholdstallet mellom lønnen til den personen som befinner seg mellom desil 9 og desil 10 (P90) og lønnen til den personen som befinner seg mellom desil 1 og desil 2 (P10)."
+          ]
+        },
+        "unit": {
+          "Ginikoeffisient": {
+            "base": "ginikoeffisient",
+            "decimals": 3
+          },
+          "P90P10": {
+            "base": "forholdstall",
+            "decimals": 1
+          }
+        }
+      },
+      "extension": {
+        "elimination": false,
+        "refperiod": {
+          "Ginikoeffisient": "",
+          "P90P10": ""
+        },
+        "show": "value"
+      }
+    },
+    "Tid": {
+      "label": "år",
+      "category": {
+        "index": {
+          "2016": 0,
+          "2017": 1,
+          "2018": 2,
+          "2019": 3,
+          "2020": 4,
+          "2021": 5,
+          "2022": 6
+        },
+        "label": {
+          "2016": "2016",
+          "2017": "2017",
+          "2018": "2018",
+          "2019": "2019",
+          "2020": "2020",
+          "2021": "2021",
+          "2022": "2022"
+        }
+      },
+      "extension": {
+        "elimination": false,
+        "show": "code"
+      }
+    }
+  },
+  "extension": {
+    "px": {
+      "infofile": "None",
+      "tableid": "13861",
+      "decimals": 1,
+      "official-statistics": true,
+      "aggregallowed": false,
+      "language": "no",
+      "matrix": "Ginikoeffisient",
+      "subject-code": "al"
+    },
+    "contact": [
+      {
+        "name": "Håkon Grini",
+        "phone": "482 05 163",
+        "mail": "gri@ssb.no",
+        "raw": "Håkon Grini, Statistisk sentralbyrå# +47 482 05 163#gri@ssb.no"
+      },
+      {
+        "name": "Knut Snellingen Bye",
+        "phone": "408 11 326",
+        "mail": "ksb@ssb.no",
+        "raw": "Knut Snellingen Bye, Statistisk sentralbyrå# +47 408 11 326#ksb@ssb.no"
+      }
+    ]
+  },
+  "value": [
+    0.205,
+    0.204,
+    0.206,
+    0.206,
+    0.206,
+    0.209,
+    0.211,
+    2.3,
+    2.3,
+    2.3,
+    2.3,
+    2.3,
+    2.3,
+    2.3,
+    0.215,
+    0.224,
+    0.221,
+    0.221,
+    0.227,
+    0.228,
+    0.232,
+    2.4,
+    2.5,
+    2.6,
+    2.6,
+    2.6,
+    2.7,
+    2.7,
+    0.226,
+    0.219,
+    0.227,
+    0.226,
+    0.222,
+    0.221,
+    0.233,
+    2.7,
+    2.6,
+    2.7,
+    2.7,
+    2.6,
+    2.6,
+    2.8,
+    0.205,
+    0.203,
+    0.205,
+    0.204,
+    0.202,
+    0.204,
+    0.207,
+    2.3,
+    2.2,
+    2.3,
+    2.3,
+    2.2,
+    2.3,
+    2.3
+  ]
+}
\ No newline at end of file
diff --git a/tests/schema_responses/13861.json b/tests/schema_responses/13861.json
new file mode 100644
index 0000000..f02bf1d
--- /dev/null
+++ b/tests/schema_responses/13861.json
@@ -0,0 +1,110 @@
+{
+  "title": "13861: Lønnsfordelingen belyst ved ulikhetsmålene Gini-koeffisient og P90/P10, etter næring (SN2007), kjønn, statistikkvariabel og år",
+  "variables": [
+    {
+      "code": "NACE2007",
+      "text": "næring (SN2007)",
+      "values": [
+        "A-Z",
+        "A",
+        "B",
+        "C",
+        "D",
+        "E",
+        "F",
+        "G",
+        "H",
+        "I",
+        "J",
+        "K",
+        "L",
+        "M",
+        "N",
+        "O",
+        "P",
+        "Q",
+        "R",
+        "S",
+        "T",
+        "U",
+        "Z"
+      ],
+      "valueTexts": [
+        "Alle næringer",
+        "Jordbruk, skogbruk og fiske",
+        "Bergverksdrift og utvinning",
+        "Industri",
+        "Elektrisitets-, gass-, damp- og varmtvannsforsyning",
+        "Vann, avløp, renovasjon",
+        "Bygge- og anleggsvirksomhet",
+        "Varehandel, reparasjon av motorvogner",
+        "Transport og lagring",
+        "Overnattings- og serveringsvirksomhet",
+        "Informasjon og kommunikasjon",
+        "Finansierings- og forsikringsvirksomhet",
+        "Omsetning og drift av fast eiendom",
+        "Faglig, vitenskapelig og teknisk tjenesteyting",
+        "Forretningsmessig tjenesteyting",
+        "Offentlig administrasjon og forsvar, og trygdeordninger underlagt offentlig forvaltning",
+        "Undervisning",
+        "Helse- og sosialtjenester",
+        "Kultur, underholdning og fritid",
+        "Annen tjenesteyting",
+        "Lønnet arbeid i private husholdninger",
+        "Internasjonale organisasjoner og organer",
+        "Ufordelt næring"
+      ],
+      "elimination": true
+    },
+    {
+      "code": "Kjonn",
+      "text": "kjønn",
+      "values": [
+        "0",
+        "1",
+        "2"
+      ],
+      "valueTexts": [
+        "Begge kjønn",
+        "Menn",
+        "Kvinner"
+      ],
+      "elimination": true
+    },
+    {
+      "code": "ContentsCode",
+      "text": "statistikkvariabel",
+      "values": [
+        "Ginikoeffisient",
+        "P90P10"
+      ],
+      "valueTexts": [
+        "Ginikoeffisient",
+        "P90/P10"
+      ]
+    },
+    {
+      "code": "Tid",
+      "text": "år",
+      "values": [
+        "2016",
+        "2017",
+        "2018",
+        "2019",
+        "2020",
+        "2021",
+        "2022"
+      ],
+      "valueTexts": [
+        "2016",
+        "2017",
+        "2018",
+        "2019",
+        "2020",
+        "2021",
+        "2022"
+      ],
+      "time": true
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/test_core.py b/tests/test_core.py
deleted file mode 100644
index 7a7f86a..0000000
--- a/tests/test_core.py
+++ /dev/null
@@ -1,22 +0,0 @@
-"""Tests standard tap features using the built-in SDK tests library."""
-
-import datetime
-
-from singer_sdk.testing import get_tap_test_class
-
-from tap_pxwebapi.tap import Tappxwebapi
-
-SAMPLE_CONFIG = {
-    "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
-    # TODO: Initialize minimal tap config
-}
-
-
-# Run standard built-in tap tests from the SDK:
-TestTappxwebapi = get_tap_test_class(
-    tap_class=Tappxwebapi,
-    config=SAMPLE_CONFIG,
-)
-
-
-# TODO: Create additional tests as appropriate for your tap.
diff --git a/tests/test_sync.py b/tests/test_sync.py
new file mode 100644
index 0000000..05e06ea
--- /dev/null
+++ b/tests/test_sync.py
@@ -0,0 +1,82 @@
+import json
+import responses
+from tap_pxwebapi.tap import Tappxwebapi
+import re
+SAMPLE_CONFIG = {
+    "tables": [
+        {
+            "table_name": "test_table",
+            "table_id": "13861",
+            "select": [
+                {
+                    "code": "NACE2007",
+                    "values": ["A-Z","A","B","C"]
+                }
+            ]
+        }
+    ],
+}
+
+# SAMPLE_REQUEST = {
+#     "query": [
+#         {
+#             "code": "NACE2007",
+#             "selection": {
+#                 "filter": "item",
+#                 "values": ["A-Z","A","B","C"]
+#             }
+#         },
+#         {
+#             "code": "Tid",
+#             "selection": {
+#                 "filter": "item",
+#                 "values": ["2016","2017","2018","2019","2020","2021","2022"]
+#             }
+#         }
+#     ],
+#     "response": {
+#         "format": "json-stat2"
+#     }
+# }
+
+SCHEMA_RESPONSE_TXT = open(
+    "tests/schema_responses/13861.json"
+).read()
+
+DATA_RESPONSE_TXT = open(
+    "tests/data_responses/13861.json"
+).read()
+
+SCHEMA_RESPONSE = json.loads(SCHEMA_RESPONSE_TXT)
+DATA_RESPONSE = json.loads(DATA_RESPONSE_TXT)
+
+
+@responses.activate
+def test_stuff(capsys):
+
+    responses.add_callback(
+        responses.POST,
+        re.compile(r"https://data.ssb.no/api/v0/en/table/13861"),
+        callback=lambda _: (200, {}, DATA_RESPONSE_TXT),
+    )
+
+    schema = responses.add(
+        responses.GET,
+        "https://data.ssb.no/api/v0/en/table/13861",
+        json=SCHEMA_RESPONSE,
+        status=200
+    )
+
+    tap1 = Tappxwebapi(config=SAMPLE_CONFIG)
+    _ = tap1.streams["test_table"].sync(None)
+
+    captured = capsys.readouterr()
+    all_stdout = captured.out.strip()
+    stdout_parts = all_stdout.split("\n")
+
+    json_messages = [json.loads(line) for line in stdout_parts]
+    data_records = [msg for msg in json_messages if msg.get("type") == "RECORD"]
+    schema_records = [msg for msg in json_messages if msg.get("type") == "SCHEMA"]
+
+    assert len(data_records) == 4*2*7
+    assert schema.call_count == 1

From 502333d1974d25f3c87a855dcfe3e00aaecdef63 Mon Sep 17 00:00:00 2001
From: Henning Holgersen
Date: Fri, 22 Dec 2023 11:54:54 +0100
Subject: [PATCH 2/4] linting

---
 .github/workflows/ci.yaml | 63 +++++++++++++++++++++++++++++
 .gitignore                |  2 ++
 pyproject.toml            |  4 +++
 tap_pxwebapi/streams.py   | 74 ++++++++++++---------------------------
 tap_pxwebapi/tap.py       | 13 +++----
 tests/test_sync.py        | 42 +++++++++++-----------
 tox.ini                   | 25 +++++++++++++
 7 files changed, 141 insertions(+), 82 deletions(-)
 create mode 100644 .github/workflows/ci.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..bb7131a
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,63 @@
+### A CI workflow template that runs linting and python testing
+### TODO: Modify as needed or as desired.
+
+name: Test tap-sharepointsites
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  linting:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Only lint using the primary version used for dev
+        python-version: ["3.9", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v3
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install Poetry
+      run: |
+        python -m pip install --upgrade pip
+        pip install poetry==1.2.*
+        pip install tox
+    - name: Run lint command from tox.ini
+      run: |
+        poetry run tox -e lint
+    - name: Commit changes
+      uses: stefanzweifel/git-auto-commit-action@v4
+      with:
+        commit_message: Apply code formatting
+  pytest:
+    runs-on: ubuntu-latest
+    needs: linting
+    env:
+      GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
+    strategy:
+      matrix:
+        python-version: [3.9]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install Poetry
+      run: |
+        python -m pip install --upgrade pip
+        pip install poetry==1.2.*
+    - name: Install dependencies
+      run: |
+        poetry install
+    - name: Test with pytest
+      run: |
+        poetry run pytest --capture=no
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 7ee7f5b..6580c5a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
 __pycache__
 poetry.lock
+.coverage
+.tox
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index 8bd1678..c150f98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,10 @@ pytest = ">=7.4.0"
 singer-sdk = { version="~=0.32.0", extras = ["testing"] }
 responses = ">0.1.0"
 pytest-cov = ">=3.0.0"
+tox = "^3.14.3"
+isort = "^5.10.1"
+flake8 = "^3.9.2"
+black = "^22.3.0"

 [tool.poetry.extras]
 s3 = ["fs-s3fs"]
diff --git a/tap_pxwebapi/streams.py b/tap_pxwebapi/streams.py
index 80350c1..c6d4142 100644
--- a/tap_pxwebapi/streams.py
+++ b/tap_pxwebapi/streams.py
@@ -2,19 +2,17 @@

 from __future__ import annotations

-import typing as t
-from pathlib import Path
 import hashlib
+from functools import cached_property
+from pathlib import Path
+from typing import Iterable
+
+import requests
 from singer_sdk import typing as th  # JSON Schema typing helpers
-from typing import Any, Callable, Iterable
+
 from tap_pxwebapi.client import pxwebapiStream
-import requests
-from functools import cached_property

-# TODO: Delete this is if not using json files for schema definition
 SCHEMAS_DIR = Path(__file__).parent / Path("./schemas")
-# TODO: - Override `UsersStream` and `GroupsStream` with your own stream definition.
-#       - Copy-paste as many times as needed to create multiple stream types.


 class TablesStream(pxwebapiStream):
@@ -33,7 +31,6 @@ def __init__(self, *args, **kwargs):

         super().__init__(*args, **kwargs)

-
     @property
    def url_base(self) -> str:
        return self.config["base_url"]
@@ -42,19 +39,19 @@ def url_base(self) -> str:
     def path(self) -> str:
         """Return API endpoint path string."""
         return f"en/table/{self.table_config['table_id']}"
-    
+
     @property
     def name(self) -> str:
         """Return a human-readable name for this stream."""
         return self.table_config["table_name"]
-    
+
     @staticmethod
     def json_stat2_to_rows(json_stat2):
         rows = []
         dimensions = json_stat2["dimension"]
         values = json_stat2["value"]
         dimension_keys = list(dimensions.keys())
-    
+
         def recursive_build_row(dim_index, current_row):
             if dim_index == len(dimension_keys):
                 current_row["value"] = values[len(rows)]
@@ -87,31 +84,20 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]:
         rows = self.json_stat2_to_rows(json_stat2)

         for i, row in enumerate(rows):
-            hash_object = hashlib.sha256()
-
             row["_sdc_row_hash"] = self.create_hash_from_dict(row)
             yield row

-
     def prepare_request_payload(
-        self, context: dict | None, next_page_token: _TToken | None
+        self, context: dict | None, next_page_token
     ) -> dict | None:
         """Prepare the data payload for the REST API request."""
-        base_payload = {
-            "query": [],
-            "response": {
-                "format": "json-stat2"
-            }
-        }
+        base_payload = {"query": [], "response": {"format": "json-stat2"}}

         for select in self.table_config.get("select", []):
             column_payload = {
                 "code": select["code"],
-                "selection": {
-                    "filter": "item",
-                    "values": select["values"]
-                }
+                "selection": {"filter": "item", "values": select["values"]},
             }

             base_payload["query"].append(column_payload)

@@ -120,11 +106,13 @@ def prepare_request_payload(

         if not last_time:
             return base_payload
-        
+
         self.logger.info("time_items: " + str(self.time_items))
-        
+
         if len(self.time_items) == 1:
-            new_times = [item for item in self.time_items[0]["values"] if item > last_time]
+            new_times = [
+                item for item in self.time_items[0]["values"] if item > last_time
+            ]

             self.logger.info("new_times: " + str(new_times))
             if not new_times:
@@ -132,19 +120,13 @@ def prepare_request_payload(
                 self.logger.info("No new times, fetching latest period")
                 time_payload = {
                     "code": self.time_items[0]["code"],
-                    "selection": {
-                        "filter": "item",
-                        "values": [last_time]
-                    }
+                    "selection": {"filter": "item", "values": [last_time]},
                 }
             else:
                 self.logger.info(f"New times found, fetching new times {new_times}")
                 time_payload = {
                     "code": self.time_items[0]["code"],
-                    "selection": {
-                        "filter": "item",
-                        "values": new_times
-                    }
+                    "selection": {"filter": "item", "values": new_times},
                 }

             base_payload["query"].append(time_payload)
@@ -152,12 +134,10 @@ def prepare_request_payload(
         self.logger.info("payload: " + str(base_payload))

         return base_payload

-
-
     @cached_property
     def schema(self) -> th.PropertiesList:
-        
+
         r = requests.get(self.url_base + self.path)
         r.raise_for_status()

@@ -165,7 +145,7 @@ def schema(self) -> th.PropertiesList:
         self.time_items = time_variable

         properties = th.PropertiesList()
-        
+
         for item in r.json()["variables"]:

             properties.append(
@@ -188,18 +168,8 @@ def schema(self) -> th.PropertiesList:

         properties.append(
             th.Property(
-                "_sdc_row_hash",
-                th.StringType,
-                description="Row number",
-                required=True
+                "_sdc_row_hash", th.StringType, description="Row number", required=True
             )
         )

         return properties.to_dict()
-
-
-
-
-
-
-
diff --git a/tap_pxwebapi/tap.py b/tap_pxwebapi/tap.py
index 77cc536..f9b7b95 100644
--- a/tap_pxwebapi/tap.py
+++ b/tap_pxwebapi/tap.py
@@ -45,15 +45,15 @@ class Tappxwebapi(Tap):
                     th.Property(
                         "values",
                         th.ArrayType(th.StringType),
-                    )
+                    ),
                 )
-            )
-        )
+            ),
+        ),
         )
     ),
     required=True,
     description="Tables to read",
-    )
+    ),
 ).to_dict()
@@ -64,10 +64,7 @@ def discover_streams(self) -> list[streams.TablesStream]:
         """

         for table in self.config["tables"]:
-            yield streams.TablesStream(
-                tap=self,
-                table_config=table
-            )
+            yield streams.TablesStream(tap=self, table_config=table)


 if __name__ == "__main__":
diff --git a/tests/test_sync.py b/tests/test_sync.py
index 05e06ea..86161a6 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -17,27 +17,20 @@
     ],
 }

-# SAMPLE_REQUEST = {
-#     "query": [
-#         {
-#             "code": "NACE2007",
-#             "selection": {
-#                 "filter": "item",
-#                 "values": ["A-Z","A","B","C"]
-#             }
-#         },
-#         {
-#             "code": "Tid",
-#             "selection": {
-#                 "filter": "item",
-#                 "values": ["2016","2017","2018","2019","2020","2021","2022"]
-#             }
-#         }
-#     ],
-#     "response": {
-#         "format": "json-stat2"
-#     }
-# }
+SAMPLE_REQUEST = {
+    "query": [
+        {
+            "code": "NACE2007",
+            "selection": {
+                "filter": "item",
+                "values": ["A-Z","A","B","C"]
+            }
+        }
+    ],
+    "response": {
+        "format": "json-stat2"
+    },
+    }

 SCHEMA_RESPONSE_TXT = open(
     "tests/schema_responses/13861.json"
@@ -50,6 +43,11 @@
 SCHEMA_RESPONSE = json.loads(SCHEMA_RESPONSE_TXT)
 DATA_RESPONSE = json.loads(DATA_RESPONSE_TXT)

+def validate_request(request):
+    jbod = request.body.decode("utf-8")
+
+    assert jbod == json.dumps(SAMPLE_REQUEST)
+    return (200, {}, DATA_RESPONSE_TXT)

 @responses.activate
 def test_stuff(capsys):
@@ -57,7 +55,7 @@
     responses.add_callback(
         responses.POST,
         re.compile(r"https://data.ssb.no/api/v0/en/table/13861"),
-        callback=lambda _: (200, {}, DATA_RESPONSE_TXT),
+        callback=validate_request,
     )

     schema = responses.add(
diff --git a/tox.ini b/tox.ini
index 70b9e4a..79da51a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,3 +17,28 @@ envlist = py37, py38, py39, py310, py311
 commands =
     poetry install -v
     poetry run pytest
+
+
+[testenv:format]
+# Attempt to auto-resolve lint errors before they are raised.
+# To execute, run `tox -e format`
+commands =
+    poetry install -v
+    poetry run black tap_pxwebapi/
+    poetry run isort tap_pxwebapi
+
+[testenv:lint]
+# Raise an error if lint and style standards are not met.
+# To execute, run `tox -e lint`
+commands =
+    poetry install -v
+    poetry run black --diff tap_pxwebapi/
+    poetry run isort --check tap_pxwebapi
+    poetry run flake8 tap_pxwebapi
+    # refer to mypy.ini for specific settings
+    ; poetry run mypy tap_pxwebapi --exclude='tap_pxwebapi/tests'
+
+[flake8]
+ignore = W503
+max-line-length = 120
+max-complexity = 10

From 46fbf86c5af15923b3eeea52dc0ad5805f316540 Mon Sep 17 00:00:00 2001
From: Henning Holgersen
Date: Fri, 22 Dec 2023 12:19:55 +0100
Subject: [PATCH 3/4] matrix fix

---
 .github/workflows/ci.yaml | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index bb7131a..04b194e 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -16,7 +16,7 @@ jobs:
     strategy:
       matrix:
         # Only lint using the primary version used for dev
-        python-version: ["3.9", "3.11"]
+        python-version: ["3.11"]

     steps:
     - uses: actions/checkout@v4
@@ -39,11 +39,10 @@ jobs:
   pytest:
     runs-on: ubuntu-latest
     needs: linting
-    env:
-      GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}}
     strategy:
+      max-parallel: 5
       matrix:
-        python-version: [3.9]
+        python-version: ["3.9", "3.10", "3.11"]

     steps:
     - uses: actions/checkout@v4

From 6ce2e6294c491c0dd4ab2ad57de658d6646a566f Mon Sep 17 00:00:00 2001
From: Henning Holgersen
Date: Fri, 22 Dec 2023 13:05:06 +0100
Subject: [PATCH 4/4] added schema test

---
 tests/test_sync.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_sync.py b/tests/test_sync.py
index 86161a6..e18fd24 100644
--- a/tests/test_sync.py
+++ b/tests/test_sync.py
@@ -77,4 +77,5 @@
     schema_records = [msg for msg in json_messages if msg.get("type") == "SCHEMA"]

     assert len(data_records) == 4*2*7
+    assert len(schema_records) == 1
     assert schema.call_count == 1