Skip to content

Commit

Permalink
add check for ListRecords verb requests with pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
JKatzwinkel committed Jun 1, 2023
1 parent e8db9d5 commit 6c63ddc
Show file tree
Hide file tree
Showing 4 changed files with 158 additions and 2 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/status.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ name: status

on:
schedule:
- cron: '0 */6 * * *'
- cron: '0 */6 * * *'
workflow_dispatch:


jobs:
Expand All @@ -24,5 +25,7 @@ jobs:
- name: check status of OAI-PMH service
run: oai-status

- name: retrieve all records in OAI set
run: oai-list-records illustrierte.liedflugschriften

...

100 changes: 100 additions & 0 deletions oai_status/list_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import sys
from typing import Iterable, Optional
from urllib.parse import quote

from delb import Document, TagNode


def request_list_records(
    setSpec: str, metadata_prefix: str = 'oai_dc', resumption_token: Optional[str] = None
) -> Document:
    '''
    request one page of `ListRecords` results from the OAI-PMH endpoint at oai.sbb.berlin and
    return the response as a document.

    :param setSpec: name of the OAI set to list; only used for the initial request.
    :param metadata_prefix: requested metadata format; only used for the initial request.
    :param resumption_token: token taken from a previous response. If given (and non-empty), it
        is the only argument sent besides the verb, and `setSpec` and `metadata_prefix` are
        ignored.
    '''
    src = 'https://oai.sbb.berlin/?verb=ListRecords'
    if resumption_token:
        # the token is passed on verbatim: the tokens shown in this module's doctests are
        # already percent-encoded, so quoting them again here would double-encode them.
        src += f'&resumptionToken={resumption_token}'
    else:
        # percent-encode the caller-supplied values so that reserved characters
        # (e.g. ':', '&', '#', spaces) cannot break or alter the query string.
        src += (
            f'&metadataPrefix={quote(metadata_prefix, safe="")}'
            f'&set={quote(setSpec, safe="")}'
        )
    return Document(src)


def get_records(doc: Document) -> Iterable[TagNode]:
    '''
    yield, one by one, the nodes matched by the XPath `//ListRecords/record` in the given
    response document.
    '''
    for record in doc.xpath('//ListRecords/record'):
        yield record


def get_resumption_token(doc: Document) -> Optional[str]:
    '''
    return the text of the first `resumptionToken` element of an OAI-PMH response, or `None`
    if the response has no such element or the element carries no text.

    >>> get_resumption_token(request_list_records('illustrierte.liedflugschriften'))
    'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften%26cursor%3D50%26batch_size%3D51'
    '''
    token_node = doc.xpath('//resumptionToken').first
    # compare against None explicitly instead of relying on the truthiness of the node object.
    if token_node is None:
        return None
    # an element without text means "no further pages"; report that as None rather than ''.
    return token_node.full_text or None


def extract_dc_bibl_data(record: TagNode, *fields: str) -> list[str]:
    '''
    extract bibliographic data from the dublin core namespace of an OAI-PMH record element.

    For every requested field, the text of the first descendant element with exactly that
    local name in the namespace `http://purl.org/dc/elements/1.1/` is returned; fields without
    a matching element yield an empty string. The result has the same order and length as
    `fields`.

    >>> tagnode = Document(
    ... '<record><metadata><oai_dc:dc '
    ... 'xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
    ... 'xmlns:dc="http://purl.org/dc/elements/1.1/">'
    ... '<dc:publisher>Gutknecht, Friedrich</dc:publisher>'
    ... '<dc:creator>Böschenstein, Johann</dc:creator></oai_dc:dc></metadata></record>'
    ... ).root
    >>> extract_dc_bibl_data(tagnode, 'creator', 'publisher', 'date')
    ['Böschenstein, Johann', 'Gutknecht, Friedrich', '']
    '''
    dc_namespace = 'http://purl.org/dc/elements/1.1/'

    def first_text(field: str) -> str:
        # match the complete universal name ('{namespace}local-name') instead of a mere
        # suffix, so that elements from other namespaces or elements whose name only ends
        # with the field name are not picked up by accident.
        wanted = f'{{{dc_namespace}}}{field}'
        match = next(
            record.iterate_descendants(
                lambda descendant: (
                    isinstance(descendant, TagNode) and descendant.universal_name == wanted
                )
            ),
            None,
        )
        return match.full_text if match is not None else ''

    return [first_text(field) for field in fields]


def list_records(
    setSpec: str, metadata_prefix: str = 'oai_dc', limit: int = -1
) -> Iterable[TagNode]:
    '''
    lazily yield the records of an OAI set by issuing `ListRecords` requests and following
    resumption tokens from page to page.

    `limit` is counted down by one per yielded record and iteration stops as soon as it hits
    zero. The default of -1 (like any other negative value) never reaches zero, so all pages
    are retrieved. The first request is made even if `limit` is 0.
    '''
    remaining = limit
    doc = request_list_records(setSpec, metadata_prefix)
    while True:
        for record in get_records(doc):
            if remaining == 0:
                return
            yield record
            remaining -= 1
        # do not request another page once the limit has been used up
        if remaining == 0:
            return
        resumption_token = get_resumption_token(doc)
        if not resumption_token:
            return
        doc = request_list_records(setSpec, metadata_prefix, resumption_token)


def main(argv: list[str] = sys.argv, limit: int = -1):
    '''
    entry point: print date, publisher, coverage and creator of every record in an OAI set,
    one comma-separated line per record.

    The set name is the last element of `argv` if `argv` has more than one element, otherwise
    'illustrierte.liedflugschriften'. The collected rows are returned only if the `unittest`
    module has been loaded; otherwise nothing is returned.
    '''
    if len(argv) > 1:
        setSpec = argv[-1]
    else:
        setSpec = 'illustrierte.liedflugschriften'
    collected = []
    for record in list_records(setSpec, limit=limit):
        row = extract_dc_bibl_data(record, 'date', 'publisher', 'coverage', 'creator')
        print(', '.join(row))
        collected.append(row)
    if 'unittest' in sys.modules:
        return collected
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ packages = [

[project.scripts]
oai-status = 'oai_status:main'
oai-list-records = 'oai_status.list_records:main'

[tool.pytest.ini_options]
addopts = '''
Expand Down
52 changes: 52 additions & 0 deletions tests/test_list_records.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from urllib import parse

from oai_status.list_records import (
get_resumption_token,
list_records,
main,
request_list_records,
)


def test_request_list_records_resumption():
    '''
    requesting a page via resumption token returns a response whose own token points at the
    following page (cursor 100).
    '''
    token_for_second_page = (
        'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften'
        '%26cursor%3D50%26batch_size%3D51'
    )
    doc = request_list_records(
        'illustrierte.liedflugschriften', resumption_token=token_for_second_page
    )
    next_token = get_resumption_token(doc)
    assert next_token
    expected = {
        'metadataPrefix': ['oai_dc'],
        'set': ['illustrierte.liedflugschriften'],
        'cursor': ['100'],
        'batch_size': ['51'],
    }
    assert parse.parse_qs(parse.unquote(next_token)) == expected


def test_list_records_paging():
    '''listing the `reformation` set without a limit is expected to yield 73 records.'''
    records = list(list_records('reformation'))
    assert len(records) == 73


def test_list_records_limited():
    '''a limit of 1 yields exactly one record.'''
    records = list(list_records('reformation', limit=1))
    assert len(records) == 1


def test_list_records_limited_paging():
    '''a limit of 125 on `illustrierte.liedflugschriften` yields exactly 125 records.'''
    retrieved = [
        record for record in list_records('illustrierte.liedflugschriften', limit=125)
    ]
    assert len(retrieved) == 125


def test_entrypoint():
    '''
    `main` returns one row per record up to the given limit, and the first column (date) of
    every row sorts before '1831'.
    '''
    csv_data = main(['', 'reformation'], 10)
    assert len(csv_data) == 10
    assert all(row[0] < '1831' for row in csv_data)

0 comments on commit 6c63ddc

Please sign in to comment.