-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add check for ListRecords verb requests with pagination
- Loading branch information
1 parent
e8db9d5
commit 6c63ddc
Showing
4 changed files
with
158 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
import sys | ||
from typing import Iterable | ||
|
||
from delb import Document, TagNode | ||
|
||
|
||
def request_list_records(
    setSpec: str, metadata_prefix: str = 'oai_dc', resumption_token: str = None
) -> Document:
    '''
    Build a `ListRecords` request URL for the OAI endpoint and pass it to `Document`.

    When `resumption_token` is truthy, it is the only argument sent and both `setSpec` and
    `metadata_prefix` are ignored; otherwise the request names the metadata prefix and the set.
    All values are interpolated into the URL as given, no percent-encoding is applied here.

    :param setSpec: value of the `set` request argument.
    :param metadata_prefix: value of the `metadataPrefix` request argument.
    :param resumption_token: value of the `resumptionToken` request argument, if any.
    :return: the `Document` created from the request URL (presumably delb fetches and parses
        the response - confirm against the delb loader in use).
    '''
    endpoint = 'https://oai.sbb.berlin/?verb=ListRecords'
    query = (
        f'&resumptionToken={resumption_token}'
        if resumption_token
        else f'&metadataPrefix={metadata_prefix}&set={setSpec}'
    )
    return Document(endpoint + query)
|
||
|
||
def get_records(doc: Document) -> Iterable[TagNode]:
    '''
    Yield every node of `doc` that is selected by the XPath `//ListRecords/record`.
    '''
    for record in doc.xpath('//ListRecords/record'):
        yield record
|
||
|
||
def get_resumption_token(doc: Document) -> str:
    '''
    Return the full text of the first node selected by `//resumptionToken` in `doc`.

    `None` is returned when that first result is falsy, i.e. when there is no such node
    (presumably also when the element is empty - confirm against delb's node truthiness).

    >>> get_resumption_token(request_list_records('illustrierte.liedflugschriften'))
    'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften%26cursor%3D50%26batch_size%3D51'
    '''
    node = doc.xpath('//resumptionToken').first
    return node.full_text if node else None
|
||
|
||
def extract_dc_bibl_data(record: TagNode, *fields: str) -> list[str]:
    '''
    Extract bibliographic data from the Dublin Core namespace of an OAI-PMH record element.

    For each requested field the full text of the first descendant element `dc:<field>` is
    returned; a field without a matching element yields an empty string. Elements are matched
    by their complete universal name (namespace and local name), so elements outside of the
    Dublin Core namespace or elements whose name merely ends with the field name are not
    picked up.

    :param record: the element whose descendants are searched.
    :param fields: local names of the Dublin Core elements to extract.
    :return: one string per requested field, in the order the fields were given.

    >>> tagnode = Document(
    ...     '<record><metadata><oai_dc:dc '
    ...     'xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
    ...     'xmlns:dc="http://purl.org/dc/elements/1.1/">'
    ...     '<dc:publisher>Gutknecht, Friedrich</dc:publisher>'
    ...     '<dc:creator>Böschenstein, Johann</dc:creator></oai_dc:dc></metadata></record>'
    ... ).root
    >>> extract_dc_bibl_data(tagnode, 'creator', 'publisher', 'date')
    ['Böschenstein, Johann', 'Gutknecht, Friedrich', '']
    '''
    dc_namespace = 'http://purl.org/dc/elements/1.1/'

    def first_match_text(field: str) -> str:
        # Clark notation, the format of `TagNode.universal_name`
        wanted_name = f'{{{dc_namespace}}}{field}'
        matches = record.iterate_descendants(
            lambda descendant: (
                isinstance(descendant, TagNode) and descendant.universal_name == wanted_name
            )
        )
        # only the first match is needed, so stop traversing as soon as it is found
        match = next(iter(matches), None)
        return '' if match is None else match.full_text

    return [first_match_text(field) for field in fields]
|
||
|
||
def list_records(
    setSpec: str, metadata_prefix: str = 'oai_dc', limit: int = -1
) -> Iterable[TagNode]:
    '''
    Retrieve records of a certain set from the OAI endpoint, i.e. make requests with the
    `ListRecords` verb and follow resumption tokens until none is returned anymore.

    Requests are made lazily: a further page is only requested once all records of the
    previous one were consumed, and no request is made at all when `limit` is `0`.

    :param setSpec: the set whose records are listed.
    :param metadata_prefix: the metadata format to request.
    :param limit: maximum number of records to yield; any negative value (the default) means
        that all records are yielded.
    :return: an iterator over the `record` elements of all requested pages.
    '''
    from itertools import islice

    def all_records() -> Iterable[TagNode]:
        # first page is addressed by set and metadata prefix, all further ones by token
        doc = request_list_records(setSpec, metadata_prefix)
        yield from get_records(doc)
        while resumption_token := get_resumption_token(doc):
            doc = request_list_records(setSpec, metadata_prefix, resumption_token)
            yield from get_records(doc)

    # `islice` stops consuming after `limit` items, hence no surplus page is requested
    yield from islice(all_records(), limit if limit >= 0 else None)
|
||
|
||
def main(argv: list[str] = sys.argv, limit: int = -1):
    '''
    Print date, publisher, coverage and creator of the records of a set, one record per line
    with the values joined by `', '`.

    :param argv: argument vector; its last item names the set if there is more than one item,
        otherwise the set `illustrierte.liedflugschriften` is used.
    :param limit: passed on to `list_records` as maximum number of records.
    :return: the list of extracted rows, but only if the `unittest` module has been imported
        in the running interpreter; `None` otherwise.
    '''
    if len(argv) > 1:
        set_spec = argv[-1]
    else:
        set_spec = 'illustrierte.liedflugschriften'

    results = []
    for record in list_records(set_spec, limit=limit):
        row = extract_dc_bibl_data(record, 'date', 'publisher', 'coverage', 'creator')
        print(', '.join(row))
        results.append(row)

    if 'unittest' in sys.modules:
        return results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from urllib import parse | ||
|
||
from oai_status.list_records import ( | ||
get_resumption_token, | ||
list_records, | ||
main, | ||
request_list_records, | ||
) | ||
|
||
|
||
def test_request_list_records_resumption():
    '''
    A request made with a resumption token whose cursor is 50 must answer with a follow-up
    token whose cursor is 100 while all other token parameters stay the same.
    '''
    token = (
        'metadataPrefix%3Doai_dc%26set%3Dillustrierte.liedflugschriften'
        '%26cursor%3D50%26batch_size%3D51'
    )
    doc = request_list_records('illustrierte.liedflugschriften', resumption_token=token)

    next_token = get_resumption_token(doc)
    assert next_token

    expected = {
        'metadataPrefix': ['oai_dc'],
        'set': ['illustrierte.liedflugschriften'],
        'cursor': ['100'],
        'batch_size': ['51'],
    }
    assert parse.parse_qs(parse.unquote(next_token)) == expected
|
||
|
||
def test_list_records_paging():
    '''Listing the `reformation` set without a limit is expected to yield 73 records.'''
    records = list(list_records('reformation'))
    assert len(records) == 73
|
||
|
||
def test_list_records_limited():
    '''With `limit=1` exactly one record of the `reformation` set is yielded.'''
    records = list(list_records('reformation', limit=1))
    assert len(records) == 1
|
||
|
||
def test_list_records_limited_paging():
    '''With `limit=125` exactly 125 records of `illustrierte.liedflugschriften` are yielded.'''
    count = sum(1 for _ in list_records('illustrierte.liedflugschriften', limit=125))
    assert count == 125
|
||
|
||
def test_entrypoint():
    '''
    `main` called for the `reformation` set with a limit of 10 returns ten rows whose first
    value compares (as string) lower than `'1831'`.
    '''
    rows = main(['', 'reformation'], 10)
    assert len(rows) == 10
    assert all(row[0] < '1831' for row in rows)