diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 0000000..04b194e
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,62 @@
+### A CI workflow template that runs linting and python testing
+### TODO: Modify as needed or as desired.
+
+name: Test tap-pxwebapi
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  linting:
+
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Only lint using the primary version used for dev
+        python-version: ["3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install Poetry
+      run: |
+        python -m pip install --upgrade pip
+        pip install poetry==1.2.*
+        pip install tox
+    - name: Run lint command from tox.ini
+      run: |
+        poetry run tox -e lint
+    - name: Commit changes
+      uses: stefanzweifel/git-auto-commit-action@v4
+      with:
+        commit_message: Apply code formatting
+  pytest:
+    runs-on: ubuntu-latest
+    needs: linting
+    strategy:
+      max-parallel: 5
+      matrix:
+        python-version: ["3.9", "3.10", "3.11"]
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install Poetry
+      run: |
+        python -m pip install --upgrade pip
+        pip install poetry==1.2.*
+    - name: Install dependencies
+      run: |
+        poetry install
+    - name: Test with pytest
+      run: |
+        poetry run pytest --capture=no
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6580c5a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+poetry.lock
+.coverage
+.tox
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index bc14e46..c150f98 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,6 +19,12 @@ requests = "~=2.31.0"
 [tool.poetry.group.dev.dependencies]
pytest = ">=7.4.0" singer-sdk = { version="~=0.32.0", extras = ["testing"] } +responses = ">0.1.0" +pytest-cov = ">=3.0.0" +tox = "^3.14.3" +isort = "^5.10.1" +flake8 = "^3.9.2" +black = "^22.3.0" [tool.poetry.extras] s3 = ["fs-s3fs"] diff --git a/tap_pxwebapi/streams.py b/tap_pxwebapi/streams.py index 80350c1..c6d4142 100644 --- a/tap_pxwebapi/streams.py +++ b/tap_pxwebapi/streams.py @@ -2,19 +2,17 @@ from __future__ import annotations -import typing as t -from pathlib import Path import hashlib +from functools import cached_property +from pathlib import Path +from typing import Iterable + +import requests from singer_sdk import typing as th # JSON Schema typing helpers -from typing import Any, Callable, Iterable + from tap_pxwebapi.client import pxwebapiStream -import requests -from functools import cached_property -# TODO: Delete this is if not using json files for schema definition SCHEMAS_DIR = Path(__file__).parent / Path("./schemas") -# TODO: - Override `UsersStream` and `GroupsStream` with your own stream definition. -# - Copy-paste as many times as needed to create multiple stream types. 
class TablesStream(pxwebapiStream): @@ -33,7 +31,6 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - @property def url_base(self) -> str: return self.config["base_url"] @@ -42,19 +39,19 @@ def url_base(self) -> str: def path(self) -> str: """Return API endpoint path string.""" return f"en/table/{self.table_config['table_id']}" - + @property def name(self) -> str: """Return a human-readable name for this stream.""" return self.table_config["table_name"] - + @staticmethod def json_stat2_to_rows(json_stat2): rows = [] dimensions = json_stat2["dimension"] values = json_stat2["value"] dimension_keys = list(dimensions.keys()) - + def recursive_build_row(dim_index, current_row): if dim_index == len(dimension_keys): current_row["value"] = values[len(rows)] @@ -87,31 +84,20 @@ def parse_response(self, response: requests.Response) -> Iterable[dict]: rows = self.json_stat2_to_rows(json_stat2) for i, row in enumerate(rows): - hash_object = hashlib.sha256() - row["_sdc_row_hash"] = self.create_hash_from_dict(row) yield row - def prepare_request_payload( - self, context: dict | None, next_page_token: _TToken | None + self, context: dict | None, next_page_token ) -> dict | None: """Prepare the data payload for the REST API request.""" - base_payload = { - "query": [], - "response": { - "format": "json-stat2" - } - } + base_payload = {"query": [], "response": {"format": "json-stat2"}} for select in self.table_config.get("select", []): column_payload = { "code": select["code"], - "selection": { - "filter": "item", - "values": select["values"] - } + "selection": {"filter": "item", "values": select["values"]}, } base_payload["query"].append(column_payload) @@ -120,11 +106,13 @@ def prepare_request_payload( if not last_time: return base_payload - + self.logger.info("time_items: " + str(self.time_items)) - + if len(self.time_items) == 1: - new_times = [item for item in self.time_items[0]["values"] if item > last_time] + new_times = [ + item for item in 
self.time_items[0]["values"] if item > last_time + ] self.logger.info("new_times: " + str(new_times)) if not new_times: @@ -132,19 +120,13 @@ def prepare_request_payload( self.logger.info("No new times, fetching latest period") time_payload = { "code": self.time_items[0]["code"], - "selection": { - "filter": "item", - "values": [last_time] - } + "selection": {"filter": "item", "values": [last_time]}, } else: self.logger.info(f"New times found, fetching new times {new_times}") time_payload = { "code": self.time_items[0]["code"], - "selection": { - "filter": "item", - "values": new_times - } + "selection": {"filter": "item", "values": new_times}, } base_payload["query"].append(time_payload) @@ -152,12 +134,10 @@ def prepare_request_payload( self.logger.info("payload: " + str(base_payload)) return base_payload - - @cached_property def schema(self) -> th.PropertiesList: - + r = requests.get(self.url_base + self.path) r.raise_for_status() @@ -165,7 +145,7 @@ def schema(self) -> th.PropertiesList: self.time_items = time_variable properties = th.PropertiesList() - + for item in r.json()["variables"]: properties.append( @@ -188,18 +168,8 @@ def schema(self) -> th.PropertiesList: properties.append( th.Property( - "_sdc_row_hash", - th.StringType, - description="Row number", - required=True + "_sdc_row_hash", th.StringType, description="Row number", required=True ) ) return properties.to_dict() - - - - - - - diff --git a/tap_pxwebapi/tap.py b/tap_pxwebapi/tap.py index 77cc536..f9b7b95 100644 --- a/tap_pxwebapi/tap.py +++ b/tap_pxwebapi/tap.py @@ -45,15 +45,15 @@ class Tappxwebapi(Tap): th.Property( "values", th.ArrayType(th.StringType), - ) + ), ) - ) - ) + ), + ), ) ), required=True, description="Tables to read", - ) + ), ).to_dict() def discover_streams(self) -> list[streams.TablesStream]: @@ -64,10 +64,7 @@ def discover_streams(self) -> list[streams.TablesStream]: """ for table in self.config["tables"]: - yield streams.TablesStream( - tap=self, - table_config=table - ) + 
yield streams.TablesStream(tap=self, table_config=table) if __name__ == "__main__": diff --git a/tests/data_responses/13861.json b/tests/data_responses/13861.json new file mode 100644 index 0000000..5c133f3 --- /dev/null +++ b/tests/data_responses/13861.json @@ -0,0 +1,212 @@ +{ + "version": "2.0", + "class": "dataset", + "label": "13861: Lønnsfordelingen belyst ved ulikhetsmålene Gini-koeffisient og P90/P10, etter næring (SN2007), statistikkvariabel og år", + "source": "Statistisk sentralbyrå", + "updated": "2023-02-02T07:00:00Z", + "note": [ + "Vi kan ikke oppgi tall for næringene «T Lønnet arbeid i private husholdninger», «U Internasjonale organisasjoner og organer» og «Z Ufordelt næring» grunnet få observasjoner." + ], + "role": { + "time": [ + "Tid" + ], + "metric": [ + "ContentsCode" + ] + }, + "id": [ + "NACE2007", + "ContentsCode", + "Tid" + ], + "size": [ + 4, + 2, + 7 + ], + "dimension": { + "NACE2007": { + "label": "næring (SN2007)", + "category": { + "index": { + "A-Z": 0, + "A": 1, + "B": 2, + "C": 3 + }, + "label": { + "A-Z": "Alle næringer", + "A": "Jordbruk, skogbruk og fiske", + "B": "Bergverksdrift og utvinning", + "C": "Industri" + } + }, + "extension": { + "elimination": true, + "eliminationValueCode": "A-Z", + "show": "code_value" + }, + "link": { + "describedby": [ + { + "extension": { + "NACE2007": "urn:ssb:classification:klass:6" + } + } + ] + } + }, + "ContentsCode": { + "label": "statistikkvariabel", + "category": { + "index": { + "Ginikoeffisient": 0, + "P90P10": 1 + }, + "label": { + "Ginikoeffisient": "Ginikoeffisient", + "P90P10": "P90/P10" + }, + "note": { + "Ginikoeffisient": [ + "Gini-koeffisienten er et summarisk mål som varierer fra 0 (minst ulikhet) og 1 (størst ulikhet)." + ], + "P90P10": [ + "P90/P10 viser til forholdstallet mellom lønnen til den personen som befinner seg mellom desil 9 og desil 10 (P90) og lønnen til den personen som befinner seg mellom desil 1 og desil 2 (P10)." 
+ ] + }, + "unit": { + "Ginikoeffisient": { + "base": "ginikoeffisient", + "decimals": 3 + }, + "P90P10": { + "base": "forholdstall", + "decimals": 1 + } + } + }, + "extension": { + "elimination": false, + "refperiod": { + "Ginikoeffisient": "", + "P90P10": "" + }, + "show": "value" + } + }, + "Tid": { + "label": "år", + "category": { + "index": { + "2016": 0, + "2017": 1, + "2018": 2, + "2019": 3, + "2020": 4, + "2021": 5, + "2022": 6 + }, + "label": { + "2016": "2016", + "2017": "2017", + "2018": "2018", + "2019": "2019", + "2020": "2020", + "2021": "2021", + "2022": "2022" + } + }, + "extension": { + "elimination": false, + "show": "code" + } + } + }, + "extension": { + "px": { + "infofile": "None", + "tableid": "13861", + "decimals": 1, + "official-statistics": true, + "aggregallowed": false, + "language": "no", + "matrix": "Ginikoeffisient", + "subject-code": "al" + }, + "contact": [ + { + "name": "Håkon Grini", + "phone": "482 05 163", + "mail": "gri@ssb.no", + "raw": "Håkon Grini, Statistisk sentralbyrå# +47 482 05 163#gri@ssb.no" + }, + { + "name": "Knut Snellingen Bye", + "phone": "408 11 326", + "mail": "ksb@ssb.no", + "raw": "Knut Snellingen Bye, Statistisk sentralbyrå# +47 408 11 326#ksb@ssb.no" + } + ] + }, + "value": [ + 0.205, + 0.204, + 0.206, + 0.206, + 0.206, + 0.209, + 0.211, + 2.3, + 2.3, + 2.3, + 2.3, + 2.3, + 2.3, + 2.3, + 0.215, + 0.224, + 0.221, + 0.221, + 0.227, + 0.228, + 0.232, + 2.4, + 2.5, + 2.6, + 2.6, + 2.6, + 2.7, + 2.7, + 0.226, + 0.219, + 0.227, + 0.226, + 0.222, + 0.221, + 0.233, + 2.7, + 2.6, + 2.7, + 2.7, + 2.6, + 2.6, + 2.8, + 0.205, + 0.203, + 0.205, + 0.204, + 0.202, + 0.204, + 0.207, + 2.3, + 2.2, + 2.3, + 2.3, + 2.2, + 2.3, + 2.3 + ] + } + \ No newline at end of file diff --git a/tests/schema_responses/13861.json b/tests/schema_responses/13861.json new file mode 100644 index 0000000..f02bf1d --- /dev/null +++ b/tests/schema_responses/13861.json @@ -0,0 +1,110 @@ +{ + "title": "13861: Lønnsfordelingen belyst ved 
ulikhetsmålene Gini-koeffisient og P90/P10, etter næring (SN2007), kjønn, statistikkvariabel og år", + "variables": [ + { + "code": "NACE2007", + "text": "næring (SN2007)", + "values": [ + "A-Z", + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "Z" + ], + "valueTexts": [ + "Alle næringer", + "Jordbruk, skogbruk og fiske", + "Bergverksdrift og utvinning", + "Industri", + "Elektrisitets-, gass-, damp- og varmtvannsforsyning", + "Vann, avløp, renovasjon", + "Bygge- og anleggsvirksomhet", + "Varehandel, reparasjon av motorvogner", + "Transport og lagring", + "Overnattings- og serveringsvirksomhet", + "Informasjon og kommunikasjon", + "Finansierings- og forsikringsvirksomhet", + "Omsetning og drift av fast eiendom", + "Faglig, vitenskapelig og teknisk tjenesteyting", + "Forretningsmessig tjenesteyting", + "Offentlig administrasjon og forsvar, og trygdeordninger underlagt offentlig forvaltning", + "Undervisning", + "Helse- og sosialtjenester", + "Kultur, underholdning og fritid", + "Annen tjenesteyting", + "Lønnet arbeid i private husholdninger", + "Internasjonale organisasjoner og organer", + "Ufordelt næring" + ], + "elimination": true + }, + { + "code": "Kjonn", + "text": "kjønn", + "values": [ + "0", + "1", + "2" + ], + "valueTexts": [ + "Begge kjønn", + "Menn", + "Kvinner" + ], + "elimination": true + }, + { + "code": "ContentsCode", + "text": "statistikkvariabel", + "values": [ + "Ginikoeffisient", + "P90P10" + ], + "valueTexts": [ + "Ginikoeffisient", + "P90/P10" + ] + }, + { + "code": "Tid", + "text": "år", + "values": [ + "2016", + "2017", + "2018", + "2019", + "2020", + "2021", + "2022" + ], + "valueTexts": [ + "2016", + "2017", + "2018", + "2019", + "2020", + "2021", + "2022" + ], + "time": true + } + ] + } \ No newline at end of file diff --git a/tests/test_core.py b/tests/test_core.py deleted file mode 100644 index 7a7f86a..0000000 --- a/tests/test_core.py +++ 
/dev/null
@@ -1,22 +0,0 @@
-"""Tests standard tap features using the built-in SDK tests library."""
-
-import datetime
-
-from singer_sdk.testing import get_tap_test_class
-
-from tap_pxwebapi.tap import Tappxwebapi
-
-SAMPLE_CONFIG = {
-    "start_date": datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%d"),
-    # TODO: Initialize minimal tap config
-}
-
-
-# Run standard built-in tap tests from the SDK:
-TestTappxwebapi = get_tap_test_class(
-    tap_class=Tappxwebapi,
-    config=SAMPLE_CONFIG,
-)
-
-
-# TODO: Create additional tests as appropriate for your tap.
diff --git a/tests/test_sync.py b/tests/test_sync.py
new file mode 100644
index 0000000..e18fd24
--- /dev/null
+++ b/tests/test_sync.py
@@ -0,0 +1,90 @@
+"""Sync test for tap-pxwebapi against mocked PxWeb API responses."""
+
+import json
+import re
+from pathlib import Path
+
+import responses
+
+from tap_pxwebapi.tap import Tappxwebapi
+
+TESTS_DIR = Path(__file__).parent
+
+SAMPLE_CONFIG = {
+    "tables": [
+        {
+            "table_name": "test_table",
+            "table_id": "13861",
+            "select": [{"code": "NACE2007", "values": ["A-Z", "A", "B", "C"]}],
+        }
+    ],
+}
+
+# Payload the tap is expected to POST for SAMPLE_CONFIG.
+SAMPLE_REQUEST = {
+    "query": [
+        {
+            "code": "NACE2007",
+            "selection": {"filter": "item", "values": ["A-Z", "A", "B", "C"]},
+        }
+    ],
+    "response": {"format": "json-stat2"},
+}
+
+
+def load_fixture(folder: str) -> str:
+    """Return the text of the table 13861 fixture stored in tests/<folder>/.
+
+    The path is resolved relative to this file so the test does not depend on
+    the working directory, and the file is decoded explicitly as UTF-8 because
+    the fixtures contain non-ASCII text.
+    """
+    return (TESTS_DIR / folder / "13861.json").read_text(encoding="utf-8")
+
+
+SCHEMA_RESPONSE_TXT = load_fixture("schema_responses")
+DATA_RESPONSE_TXT = load_fixture("data_responses")
+
+SCHEMA_RESPONSE = json.loads(SCHEMA_RESPONSE_TXT)
+DATA_RESPONSE = json.loads(DATA_RESPONSE_TXT)
+
+
+def validate_request(request):
+    """Check the POSTed query payload and answer with the canned data response.
+
+    Compares parsed JSON rather than raw text so the check does not depend on
+    how the HTTP client formats the serialized body.
+    """
+    assert json.loads(request.body) == SAMPLE_REQUEST
+    return (200, {}, DATA_RESPONSE_TXT)
+
+
+@responses.activate
+def test_stuff(capsys):
+    """Sync one table against mocked endpoints and check the emitted messages."""
+    responses.add_callback(
+        responses.POST,
+        re.compile(r"https://data\.ssb\.no/api/v0/en/table/13861"),
+        callback=validate_request,
+    )
+
+    schema = responses.add(
+        responses.GET,
+        "https://data.ssb.no/api/v0/en/table/13861",
+        json=SCHEMA_RESPONSE,
+        status=200,
+    )
+
+    tap1 = Tappxwebapi(config=SAMPLE_CONFIG)
+    tap1.streams["test_table"].sync(None)
+
+    stdout_lines = capsys.readouterr().out.strip().splitlines()
+    json_messages = [json.loads(line) for line in stdout_lines]
+    data_records = [msg for msg in json_messages if msg.get("type") == "RECORD"]
+    schema_records = [msg for msg in json_messages if msg.get("type") == "SCHEMA"]
+
+    # The data fixture has size 4 x 2 x 7, i.e. one record per value.
+    assert len(data_records) == 4 * 2 * 7
+    assert len(schema_records) == 1
+    # The mocked GET endpoint must be hit exactly once.
+    assert schema.call_count == 1
diff --git a/tox.ini b/tox.ini
index 70b9e4a..79da51a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -17,3 +17,28 @@ envlist = py37, py38, py39, py310, py311
 commands =
     poetry install -v
     poetry run pytest
+
+
+[testenv:format]
+# Attempt to auto-resolve lint errors before they are raised.
+# To execute, run `tox -e format`
+commands =
+    poetry install -v
+    poetry run black tap_pxwebapi/
+    poetry run isort tap_pxwebapi
+
+[testenv:lint]
+# Raise an error if lint and style standards are not met.
+# To execute, run `tox -e lint`
+commands =
+    poetry install -v
+    poetry run black --check --diff tap_pxwebapi/
+    poetry run isort --check tap_pxwebapi
+    poetry run flake8 tap_pxwebapi
+    # refer to mypy.ini for specific settings
+    ; poetry run mypy tap_pxwebapi --exclude='tap_pxwebapi/tests'
+
+[flake8]
+ignore = W503
+max-line-length = 120
+max-complexity = 10