Skip to content

Commit

Permalink
feat: Add User Defined Source support (#114)
Browse files Browse the repository at this point in the history
Signed-off-by: Sidhant Kohli <[email protected]>
  • Loading branch information
kohlisid authored Oct 13, 2023
1 parent 7b90d4d commit 71c2b52
Show file tree
Hide file tree
Showing 43 changed files with 2,000 additions and 426 deletions.
3 changes: 1 addition & 2 deletions .codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,4 @@ ignore:
- "pynumaflow/reducer/proto/*"
- "pynumaflow/sourcetransformer/proto/*"
- "pynumaflow/sideinput/proto/*"
- "pynumaflow/map/_udfunction_pb2.pyi"
- "pynumaflow/sink/_udsink_pb2.pyi"
- "pynumaflow/sourcer/proto/*"
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ proto:
python3 -m grpc_tools.protoc -I=pynumaflow/reducer/proto --python_out=pynumaflow/reducer/proto --grpc_python_out=pynumaflow/reducer/proto pynumaflow/reducer/proto/*.proto
python3 -m grpc_tools.protoc -I=pynumaflow/sourcetransformer/proto --python_out=pynumaflow/sourcetransformer/proto --grpc_python_out=pynumaflow/sourcetransformer/proto pynumaflow/sourcetransformer/proto/*.proto
python3 -m grpc_tools.protoc -I=pynumaflow/sideinput/proto --python_out=pynumaflow/sideinput/proto --grpc_python_out=pynumaflow/sideinput/proto pynumaflow/sideinput/proto/*.proto
python3 -m grpc_tools.protoc -I=pynumaflow/sourcer/proto --python_out=pynumaflow/sourcer/proto --grpc_python_out=pynumaflow/sourcer/proto pynumaflow/sourcer/proto/*.proto


sed -i '' 's/^\(import.*_pb2\)/from . \1/' pynumaflow/*/proto/*.py
54 changes: 54 additions & 0 deletions examples/source/simple-source/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
####################################################################################################
# builder: install needed dependencies
####################################################################################################

FROM python:3.10-slim-bullseye AS builder

ENV PYTHONFAULTHANDLER=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONHASHSEED=random \
    PIP_NO_CACHE_DIR=on \
    PIP_DISABLE_PIP_VERSION_CHECK=on \
    PIP_DEFAULT_TIMEOUT=100 \
    POETRY_VERSION=1.2.2 \
    POETRY_HOME="/opt/poetry" \
    POETRY_VIRTUALENVS_IN_PROJECT=true \
    POETRY_NO_INTERACTION=1 \
    PYSETUP_PATH="/opt/pysetup" \
    VENV_PATH="/opt/pysetup/.venv"

# Put both the poetry binary and the project virtualenv on PATH so that a plain
# `python` in entry.sh resolves to the virtualenv interpreter.
ENV PATH="$POETRY_HOME/bin:$VENV_PATH/bin:$PATH"

RUN apt-get update \
    && apt-get install --no-install-recommends -y \
        curl \
        wget \
        # deps for building python deps
        build-essential \
    && apt-get install -y git \
    && apt-get clean && rm -rf /var/lib/apt/lists/* \
    \
    # install dumb-init; the release assets are named after the machine architecture
    # (x86_64, aarch64, ...), so pick the one matching the platform being built instead
    # of hard-coding x86_64, which produces a broken image for linux/arm64 builds
    && wget -O /dumb-init "https://github.com/Yelp/dumb-init/releases/download/v1.2.5/dumb-init_1.2.5_$(uname -m)" \
    && chmod +x /dumb-init \
    && curl -sSL https://install.python-poetry.org | python3 -

####################################################################################################
# udf: used for running the udf vertices
####################################################################################################
FROM builder AS udf

# Install dependencies first, from pyproject.toml only, so this layer is cached
# until the dependency list changes.
WORKDIR $PYSETUP_PATH
COPY pyproject.toml ./
RUN poetry install --no-cache --no-root && \
    rm -rf ~/.cache/pypoetry/

# COPY is preferred over ADD for plain local files (no archive extraction / URL fetch needed)
COPY . /app
WORKDIR /app

RUN chmod +x entry.sh

# dumb-init runs as PID 1 and forwards signals to the entry script
ENTRYPOINT ["/dumb-init", "--"]
CMD ["/app/entry.sh"]

EXPOSE 5000
8 changes: 8 additions & 0 deletions examples/source/simple-source/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
.PHONY: image
image:
docker build -t "quay.io/numaio/numaflow-python/simple-source:v0.5.3" .
# The GitHub CI runner uses platform linux/amd64. If your local environment doesn't, the image built by the command above might not work
# under the CI E2E test environment.
# To build an image that supports multiple platforms (linux/amd64, linux/arm64) and push it to quay.io, use the following command:
# docker buildx build -t "quay.io/numaio/numaflow-python/simple-source:v0.5.3" --platform linux/amd64,linux/arm64 . --push
# If the command fails, refer to https://billglover.me/notes/build-multi-arch-docker-images/ to fix it.
6 changes: 6 additions & 0 deletions examples/source/simple-source/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Example Python User Defined Source
A simple example of a user-defined source. The source emits messages whose payload is an incrementing integer
index and implements the Read, Ack, and Pending handlers.
The read handler returns the next x messages, where x is the number of records requested; it returns nothing while previously read messages are still unacknowledged.
The ack handler acknowledges the offsets of the messages returned by the read handler.
The pending handler returns 0 to indicate that the simple source always has 0 pending messages.
4 changes: 4 additions & 0 deletions examples/source/simple-source/entry.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Container entry script: launches the example user-defined source.
# -e: abort on the first failing command
# -u: treat references to unset variables as errors
# -x: echo each command before executing it
set -eux

# Runs example.py from the current working directory.
python example.py
67 changes: 67 additions & 0 deletions examples/source/simple-source/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from collections.abc import Iterable
from datetime import datetime

from pynumaflow.sourcer import (
ReadRequest,
Message,
Sourcer,
AckRequest,
PendingResponse,
Offset,
)


class SimpleSource:
    """
    SimpleSource is a class for User Defined Source implementation.

    It emits messages whose payload and offset are an incrementing integer index,
    and it does not serve a new batch while offsets from an earlier batch are
    still waiting to be acknowledged.
    """

    def __init__(self):
        """
        to_ack_set: set of offsets (as strings) that were read but not yet acknowledged
        read_idx : index of the next message to be read
        """
        self.to_ack_set = set()
        self.read_idx = 0

    def read_handler(self, datum: ReadRequest) -> Iterable[Message]:
        """
        Yield up to ``datum.num_records`` messages for a read request.

        Each yielded message uses the current read_idx as both payload and offset;
        the offset is then recorded in to_ack_set and read_idx is incremented.
        Nothing is yielded while to_ack_set is non-empty.
        """
        # Hold back new data until the previous batch has been fully acknowledged.
        if self.to_ack_set:
            return

        for _ in range(datum.num_records):
            # Render the index once; it is the payload, the offset and the ack key.
            idx = str(self.read_idx)
            yield Message(
                payload=idx.encode(),
                offset=Offset(offset=idx.encode(), partition_id="0"),
                event_time=datetime.now(),
            )
            self.to_ack_set.add(idx)
            self.read_idx += 1

    def ack_handler(self, ack_request: AckRequest):
        """
        The ack handler is used to acknowledge the offsets that have been read, and
        remove them from the to_ack_set.

        Offsets that are not pending (for example a repeated acknowledgement) are
        ignored rather than raising KeyError.
        """
        for offset in ack_request.offset:
            # discard() instead of remove(): acknowledging an unknown or already
            # acknowledged offset must be a no-op, not a crash of the handler.
            self.to_ack_set.discard(str(offset.offset, "utf-8"))

    def pending_handler(self) -> PendingResponse:
        """
        The simple source always returns zero to indicate there is no pending record.
        """
        return PendingResponse(count=0)


if __name__ == "__main__":
    # Build the example source, register its three handlers with the
    # Sourcer server and start serving.
    source = SimpleSource()
    Sourcer(
        read_handler=source.read_handler,
        ack_handler=source.ack_handler,
        pending_handler=source.pending_handler,
    ).start()
21 changes: 21 additions & 0 deletions examples/source/simple-source/pipeline-numaflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
apiVersion: numaflow.numaproj.io/v1alpha1
kind: Pipeline
metadata:
  name: simple-source
spec:
  vertices:
    # Source vertex backed by the user-defined source container
    - name: in
      source:
        udsource:
          container:
            # A simple user-defined source for e2e testing
            image: quay.io/numaio/numaflow-python/simple-source:v0.5.3
            imagePullPolicy: Always
      limits:
        readBatchSize: 2
    # Sink vertex that logs every message it receives
    - name: out
      sink:
        log: {}
  edges:
    - from: in
      to: out
16 changes: 16 additions & 0 deletions examples/source/simple-source/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Poetry project definition for the simple-source example image.
[tool.poetry]
name = "simple-source"
version = "0.2.4"
description = ""
authors = ["Numaflow developers"]

[tool.poetry.dependencies]
# "~3.10" allows any 3.10.x release; the Dockerfile builds on a python:3.10 base image.
python = "~3.10"
# "~0.5.3" allows >=0.5.3,<0.6.0; the example image is tagged v0.5.3.
pynumaflow = "~0.5.3"


# No development-only dependencies are declared.
[tool.poetry.dev-dependencies]

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
1 change: 1 addition & 0 deletions pynumaflow/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
MULTIPROC_MAP_SOCK_PORT = 55551
MULTIPROC_MAP_SOCK_ADDR = "0.0.0.0"
SIDE_INPUT_SOCK_PATH = "/var/run/numaflow/sideinput.sock"
SOURCE_SOCK_PATH = "/var/run/numaflow/source.sock"

# TODO: need to make sure the DATUM_KEY value is the same as
# https://github.com/numaproj/numaflow-go/blob/main/pkg/function/configs.go#L6
Expand Down
57 changes: 0 additions & 57 deletions pynumaflow/mapper/_udfunction_pb2.pyi

This file was deleted.

27 changes: 14 additions & 13 deletions pynumaflow/mapper/proto/map_pb2.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 0 additions & 57 deletions pynumaflow/mapstreamer/_udfunction_pb2.pyi

This file was deleted.

Loading

0 comments on commit 71c2b52

Please sign in to comment.