add check for ListRecords verb requests with pagination

StabiBerlin · Jun 1, 2023 · 6c63ddc · 6c63ddc
1 parent e8db9d5
commit 6c63ddc
Show file tree

Hide file tree

Showing 4 changed files with 158 additions and 2 deletions.
diff --git a/.github/workflows/status.yml b/.github/workflows/status.yml
@@ -4,7 +4,8 @@ name: status
 
 on:
   schedule:
-  - cron: '0 */6 * * *'
+    - cron: '0 */6 * * *'
+  workflow_dispatch:
 
 
 jobs:
@@ -24,5 +25,7 @@ jobs:
       - name: check status of OAI-PMH service
         run: oai-status
 
+      - name: retrieve all records in OAI set
+        run: oai-list-records illustrierte.liedflugschriften
+
 ...
-
diff --git a/oai_status/list_records.py b/oai_status/list_records.py
@@ -0,0 +1,100 @@
+import sys
+from typing import Iterable, Optional
+
+from delb import Document, TagNode
+
+
+def request_list_records(
+    setSpec: str, metadata_prefix: str = 'oai_dc', resumption_token: str = None
+) -> Document:
+    '''
+    build the URL for a `ListRecords` request against the OAI endpoint and load it as a document.
+
+    When a resumption token is given, it is the only argument sent besides the verb, and
+    `setSpec` / `metadata_prefix` are ignored. Otherwise a fresh listing of the set is requested.
+    The token is inserted into the URL verbatim, without any additional quoting.
+    '''
+    if resumption_token:
+        query = f'&resumptionToken={resumption_token}'
+    else:
+        query = f'&metadataPrefix={metadata_prefix}&set={setSpec}'
+    return Document('https://oai.sbb.berlin/?verb=ListRecords' + query)
+
+
+def get_records(doc: Document) -> Iterable[TagNode]:
+    '''
+    lazily yield every node that the query `//ListRecords/record` matches in the document.
+    '''
+    for record in doc.xpath('//ListRecords/record'):
+        yield record
+
+
+def get_resumption_token(doc: Document) -> Optional[str]:
+    '''
+    return the text of the first `resumptionToken` element found in a response document, or
+    `None` if the document has no such element or the element carries no text.
+
+    >>> get_resumption_token(request_list_records('illustrierte.liedflugschriften'))
+    'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften%26cursor%3D50%26batch_size%3D51'
+
+    '''
+    token_node = doc.xpath('//resumptionToken').first
+    # test for absence explicitly instead of relying on the truthiness of a node object
+    if token_node is None:
+        return None
+    # an empty token element carries no token; report it the same way as a missing element
+    return token_node.full_text or None
+
+
+def extract_dc_bibl_data(record: TagNode, *fields: str) -> list[str]:
+    '''
+    extract bibliographic data from the dublin core namespace of an OAI-PMH record element.
+
+    For every requested field name, the text of the first descendant element with that name in
+    the dublin core namespace is looked up. The result has one entry per field, in the order the
+    fields were passed; a field without a matching element contributes an empty string.
+
+    >>> tagnode = Document(
+    ...     '<record><metadata><oai_dc:dc '
+    ...     'xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
+    ...     'xmlns:dc="http://purl.org/dc/elements/1.1/">'
+    ...     '<dc:publisher>Gutknecht, Friedrich</dc:publisher>'
+    ...     '<dc:creator>Böschenstein, Johann</dc:creator></oai_dc:dc></metadata></record>'
+    ... ).root
+
+    >>> extract_dc_bibl_data(tagnode, 'creator', 'publisher', 'date')
+    ['Böschenstein, Johann', 'Gutknecht, Friedrich', '']
+
+    '''
+    dc_namespace = 'http://purl.org/dc/elements/1.1/'
+
+    def first_match_text(field: str) -> str:
+        # compare the complete qualified name: a mere suffix test would also accept unrelated
+        # elements, e.g. an `identifier` outside of the dublin core namespace or any element
+        # whose name merely ends in `date`
+        wanted = f'{{{dc_namespace}}}{field}'
+        matches = record.iterate_descendants(
+            lambda node: isinstance(node, TagNode) and node.universal_name == wanted
+        )
+        # only the first hit is needed, so stop searching as soon as it is found
+        match = next(iter(matches), None)
+        return '' if match is None else match.full_text
+
+    return [first_match_text(field) for field in fields]
+
+
+def list_records(
+    setSpec: str, metadata_prefix: str = 'oai_dc', limit: int = -1
+) -> Iterable[TagNode]:
+    '''
+    retrieve records of a certain set from OAI endpoint, i.e. make requests with the `ListRecords`
+    verb.
+
+    Pages are requested one after another by following the resumption token of the previous
+    response, until a response carries no token or `limit` records have been yielded. A negative
+    `limit` (the default) means no limit; with a `limit` of 0 no request is made at all.
+    '''
+    remaining = limit
+    resumption_token = None
+    # a negative counter never reaches zero while counting down, hence "unlimited"
+    while remaining != 0:
+        doc = request_list_records(setSpec, metadata_prefix, resumption_token)
+        for record in get_records(doc):
+            yield record
+            remaining -= 1
+            if remaining == 0:
+                # limit reached: stop without requesting any further page
+                return
+        resumption_token = get_resumption_token(doc)
+        if not resumption_token:
+            # no token in the response: nothing left to request
+            return
+
+
+def main(argv: list[str] = sys.argv, limit: int = -1):
+    '''
+    entry point: print one comma-separated line with date, publisher, coverage and creator for
+    every record listed for an OAI set.
+
+    The set is taken from the last element of `argv`; if `argv` has no more than one element,
+    `illustrierte.liedflugschriften` is used. `limit` is passed on to `list_records`.
+    '''
+    if len(argv) > 1:
+        setSpec = argv[-1]
+    else:
+        setSpec = 'illustrierte.liedflugschriften'
+    rows = []
+    for record in list_records(setSpec, limit=limit):
+        row = extract_dc_bibl_data(record, 'date', 'publisher', 'coverage', 'creator')
+        print(', '.join(row))
+        rows.append(row)
+    # the collected rows are only handed back while the `unittest` module is loaded; otherwise
+    # the function falls through and returns None.
+    # NOTE(review): presumably because a console script wrapper passes the return value on to
+    # `sys.exit` -- confirm.
+    if 'unittest' in sys.modules:
+        return rows
diff --git a/pyproject.toml b/pyproject.toml
@@ -21,6 +21,7 @@ packages = [
 
 [project.scripts]
 oai-status = 'oai_status:main'
+oai-list-records = 'oai_status.list_records:main'
 
 [tool.pytest.ini_options]
 addopts = '''

diff --git a/tests/test_list_records.py b/tests/test_list_records.py
@@ -0,0 +1,52 @@
+from urllib import parse
+
+from oai_status.list_records import (
+    get_resumption_token,
+    list_records,
+    main,
+    request_list_records,
+)
+
+
+def test_request_list_records_resumption():
+    '''
+    a request made with the token for cursor 50 must answer with a token for cursor 100.
+    '''
+    token = (
+        'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften'
+        '%26cursor%3D50%26batch_size%3D51'
+    )
+    doc = request_list_records('illustrierte.liedflugschriften', resumption_token=token)
+    next_token = get_resumption_token(doc)
+    assert next_token
+    expected = {
+        'metadataPrefix': ['oai_dc'],
+        'set': ['illustrierte.liedflugschriften'],
+        'cursor': ['100'],
+        'batch_size': ['51']
+    }
+    # the token text is percent-encoded, so decode it before splitting it into parameters
+    assert parse.parse_qs(parse.unquote(next_token)) == expected
+
+
+def test_list_records_paging():
+    '''
+    an unlimited listing of the `reformation` set is expected to deliver 73 records.
+    '''
+    records = list(list_records('reformation'))
+    assert len(records) == 73
+
+
+def test_list_records_limited():
+    '''
+    with a limit of 1, exactly one record of the `reformation` set must be delivered.
+    '''
+    records = list(list_records('reformation', limit=1))
+    assert len(records) == 1
+
+
+def test_list_records_limited_paging():
+    '''
+    a limit of 125 on the `illustrierte.liedflugschriften` set must deliver exactly 125 records.
+    '''
+    assert len(list(list_records('illustrierte.liedflugschriften', limit=125))) == 125
+
+
+def test_entrypoint():
+    '''
+    run the entry point on the `reformation` set with a limit of 10: ten rows must come back,
+    and the first column of each (the date) must compare below '1831' as a string.
+    '''
+    csv_data = main(['', 'reformation'], 10)
+    assert len(csv_data) == 10
+    assert all(row[0] < '1831' for row in csv_data)