From 6370cb3870c70ac9d27e6c7326fff729bfdb4986 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Sat, 17 Feb 2024 23:13:02 -0800 Subject: [PATCH] standardize on markdown format with optional frontmatter --- airbyte/documents.py | 153 +++++++++++++++++++++++++++++++++---------- 1 file changed, 120 insertions(+), 33 deletions(-) diff --git a/airbyte/documents.py b/airbyte/documents.py index d30ffd31..b85ce8c1 100644 --- a/airbyte/documents.py +++ b/airbyte/documents.py @@ -2,21 +2,50 @@ This module is modeled after the LangChain project's `Documents` class: - https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py + +To inform how to render a specific stream's records as documents, this implementation proposes that +sources define a `document_rendering` annotation in their JSON schema. This property would contain +instructions for how to render records as documents, such as which properties to render as content, +which properties to render as metadata, and which properties to render as annotations. + +Assuming a stream like GitHub Issues, the `document_rendering` annotation might look like this: +```json +{ + "airbyte_document_rendering": { + "title_property": "title", + "content_properties": ["body"], + "frontmatter_properties": ["url", "author"], + "metadata_properties": ["id", "created_at", "updated_at", "url"] + } +} +``` + +Note that the `document_rendering` annotation is optional. """ from __future__ import annotations import textwrap +from collections import OrderedDict from typing import TYPE_CHECKING, Any +import yaml from pydantic import BaseModel -MAX_SINGLE_LINE_LENGTH = 60 - if TYPE_CHECKING: import datetime from collections.abc import Iterable + from airbyte_protocol.models import ConfiguredAirbyteStream + + +MAX_SINGLE_LINE_LENGTH = 60 +AIRBYTE_DOCUMENT_RENDERING = "airbyte_document_rendering" +TITLE_PROPERTY = "title_property" +CONTENT_PROPS = "content_properties" +FRONTMATTER_PROPS = "frontmatter_properties" +METADATA_PROPERTIES = "metadata_properties" + def _to_title_case(name: str, /) -> str: """Convert a string to title case. @@ -49,53 +78,111 @@ class Document(BaseModel): last_modified: datetime.datetime | None = None @classmethod - def from_records(cls, records: Iterable[dict[str, Any]]) -> Iterable[Document]: + def from_records( + cls, + records: Iterable[dict[str, Any]], + stream_metadata: ConfiguredAirbyteStream, + ) -> Iterable[Document]: """Create an iterable of documents from an iterable of records.""" - yield from {cls.from_record(record) for record in records} + yield from { + cls.from_record(record=record, stream_metadata=stream_metadata) for record in records + } @classmethod - def from_record(cls, record: dict[str, Any]) -> Document: + def from_record( + cls, + record: dict[str, Any], + stream_metadata: ConfiguredAirbyteStream, + ) -> Document: """Create a document from a record. + The document will be rendered as a markdown document, with content, frontmatter, and an + optional title. If there are multiple properties to render as content, they will be rendered + beneath H2 section headers. If there is only one property to render as content, it will be + rendered without a section header. If a title property is specified, it will be rendered as + an H1 header at the top of the document. + + If metadata properties are not specified, then they will default to those properties which + are not specified as content, title, or frontmatter properties. Metadata properties are + not rendered in the document, but are carried with the document in a separate dictionary + object. + TODO: - - Parse 'id' from primary key records, if available. Otherwise hash the record data. - - Parse 'last_modified' from the record, when available. - - Add a convention to let the source define how 'content' should be rendered. In - that case, the default rendering behavior below would become the fallback. - - Add smarter logic for deciding which fields are metadata and which are content. In this - first version, we assume that all string fields are content and all other fields are - metadata - which doesn't work well for URLs, IDs, and many other field types. + - Only use cursor_field for 'last_modified' when cursor_field is a timestamp. """ - primary_keys: list[str] = [] # TODO: Get the actual primary keys here. - document_fields: list[str] = [ - property_name for property_name, value in record.values() if isinstance(value, str) - ] - metadata_fields = set(record.keys()) - set(document_fields) + primary_keys: list[str] = [] + all_properties = list(record.keys()) + + # TODO: Let the source define how 'content' should be rendered. In that case, the source + # specifies specific properties to render as properties (at the top of the document) and + # which properties to render as content (in the body of the document). By default, we assume + # that properties not defined as properties or as content are metadata, but this may be + # overridden by the source, for instance in cases of redundancies. + if AIRBYTE_DOCUMENT_RENDERING in stream_metadata.stream.json_schema: + render_instructions = stream_metadata.stream.json_schema[AIRBYTE_DOCUMENT_RENDERING] + if TITLE_PROPERTY in render_instructions: + title_prop: str | None = render_instructions[TITLE_PROPERTY] or None + if CONTENT_PROPS in render_instructions: + content_props: list[str] = render_instructions[CONTENT_PROPS] + if FRONTMATTER_PROPS in render_instructions: + frontmatter_props: list[str] = render_instructions[FRONTMATTER_PROPS] + if METADATA_PROPERTIES in render_instructions: + metadata_props: list[str] = render_instructions[METADATA_PROPERTIES] + else: + title_prop: str | None = None + frontmatter_props: list[str] = [ + property_name + for property_name, value in record.items() + if isinstance(value, str) and len(value) < MAX_SINGLE_LINE_LENGTH + ] + content_props: list[str] = [ + property_name + for property_name in all_properties + if property_name not in frontmatter_props + ] + metadata_props = set(record.keys()) - set(content_props) - set(content_props) + doc_id: str = ( - "-".join(str(record[key]) for key in primary_keys) + "-".join(str(record[key]) for key in stream_metadata.primary_key) if primary_keys else str(hash(record)) ) + if stream_metadata.cursor_field: + last_modified_key = ".".join(stream_metadata.cursor_field) last_modified_key = None # TODO: Get the actual last modified key here, when available. - # Short content is rendered as a single line, while long content is rendered as a indented - # multi-line string with a 100 character width. - content = "\n".join( - f"{_to_title_case(key)}: {value}" - if len(value) < MAX_SINGLE_LINE_LENGTH - else f"{_to_title_case(key)}: \n{textwrap.wrap( - value, - width=100, - initial_indent=' ' * 4, - subsequent_indent=' ' * 4, - break_long_words=False, - )}\n" - for key, value in record.items() - if key in document_fields + content: str = ( + "---\n" + + yaml.dump(OrderedDict((key, record[key]) for key in frontmatter_props)) + + "---\n" ) + if title_prop: + content += f"# {record[title_prop]}\n\n" + + if len(content_props) > 0: + pass + elif len(content_props) == 1: + # Only one property to render as content; no need for section headers. + content += "\n".join( + textwrap.wrap( + record[content_props[0]], + width=100, + break_long_words=False, + ) + ) + else: + # Multiple properties to render as content; use H2 section headers. + content += "\n".join( + f"## {_to_title_case(key)}\n\n{textwrap.wrap( + record[key], + width=100, + break_long_words=False, + )}\n" + for key in content_props + ) return cls( id=doc_id, content=content, - metadata={key: record[key] for key in metadata_fields}, + metadata={key: record[key] for key in metadata_props}, last_modified=record[last_modified_key] if last_modified_key else None, )