diff --git a/applications/vila_live/README.md b/applications/vila_live/README.md
index 020c10dd..4ac221bd 100644
--- a/applications/vila_live/README.md
+++ b/applications/vila_live/README.md
@@ -61,6 +61,19 @@ Once the main LMM-based app is running, you will see a link for the app at `http
 sudo ln -s libv4l2.so.0.0.0.0 libv4l2.so.0.0.999999
 ```
 
+## 📷⚙️ Video Options
+There are three options for ingesting video data:
+
+1. Use a physical device or capture card, such as a V4L2 device, as described in the [Setup Instructions](#%EF%B8%8F-setup-instructions). Make sure [vila_live.yaml](./vila_live.yaml) contains the `v4l2_source` group and specifies the device correctly.
+2. Convert a video file to a GXF-compatible format using the [convert_video_to_gxf_entities.py](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/scripts#convert_video_to_gxf_entitiespy) script. See the [yolo_model_deployment](https://github.com/nvidia-holoscan/holohub/tree/main/applications/yolo_model_deployment#step-2-deployment) application for a detailed example. When using the replayer, configure the `replayer_source` group in the yaml file and launch the application with:
+   ```bash
+   ./run_vila_live.sh --source "replayer"
+   ```
+3. Create a virtual video device that mounts a video file and replays it, as detailed in the [v4l2_camera examples in holoscan-sdk](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/examples/v4l2_camera#use-with-v4l2-loopback-devices). On a system with secure boot enabled, this approach may require signing the [v4l2loopback kernel module](https://github.com/umlaeute/v4l2loopback). Make sure `vila_live.yaml` contains the `v4l2_source` group and specifies the virtual device correctly, then replay the video into the virtual device, for example:
+   ```bash
+   ffmpeg -stream_loop -1 -re -i <video_file> -pix_fmt yuyv422 -f v4l2 /dev/video3
+   ```
+
 ## 🙌 Acknowledgements
 - Jetson AI Lab, [Live LLaVA](https://www.jetson-ai-lab.com/tutorial_live-llava.html): for the inspiration to create this app
 - [Jetson-Containers](https://github.com/dusty-nv/jetson-containers/tree/master/packages/llm/llamaspeak) repo: For the Flask web-app with WebSockets
diff --git a/applications/vila_live/run_vila_live.sh b/applications/vila_live/run_vila_live.sh
index b4d39f7c..b054c05a 100755
--- a/applications/vila_live/run_vila_live.sh
+++ b/applications/vila_live/run_vila_live.sh
@@ -41,7 +41,7 @@
 python3 -m tinychat.serve.controller --host 0.0.0.0 --port 10000 & bg_pids+=($!)
 python3 -m tinychat.serve.model_worker_new --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 \
     --model-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/ \
     --quant-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/llm/llama-3-vila1.5-8b-w4-g128-awq-v2.pt & bg_pids+=($!)
-python3 /workspace/holohub/applications/vila_live/vila_live.py & bg_pids+=($!)
+python3 /workspace/holohub/applications/vila_live/vila_live.py "$@" & bg_pids+=($!)
 # Let the script clean up the server process
 wait
diff --git a/applications/vila_live/vila_live.py b/applications/vila_live/vila_live.py
index 332b3eb9..ca659706 100644
--- a/applications/vila_live/vila_live.py
+++ b/applications/vila_live/vila_live.py
@@ -22,7 +22,12 @@
 
 import cupy as cp
 from holoscan.core import Application, Operator, OperatorSpec
-from holoscan.operators import FormatConverterOp, HolovizOp, V4L2VideoCaptureOp
+from holoscan.operators import (
+    FormatConverterOp,
+    HolovizOp,
+    V4L2VideoCaptureOp,
+    VideoStreamReplayerOp,
+)
 from holoscan.resources import CudaStreamPool, UnboundedAllocator
 from PIL import Image
 from vlm import VLM
@@ -83,20 +88,43 @@ def compute(self, op_input, op_output, context):
 
 
 class V4L2toVLM(Application):
-    def __init__(self):
+    def __init__(self, data, source="v4l2", video_device="none"):
         """V4L2 to VLM app"""
         super().__init__()
         # set name
         self.name = "V4L2 to VLM app"
+        self.source = source
+
+        if data == "none":
+            data = "/workspace/holohub/data/vila_live"
+
+        self.sample_data_path = data
+        self.video_device = video_device
 
     def compose(self):
-        # V4L2 to capture usb camera input
-        source = V4L2VideoCaptureOp(
-            self,
-            name="source",
-            allocator=UnboundedAllocator(self, name="pool"),
-            **self.kwargs("source"),
-        )
+        pool = UnboundedAllocator(self, name="pool")
+
+        # V4L2 to capture usb camera input or replayer to replay video
+        if self.source == "v4l2":
+            v4l2_args = self.kwargs("v4l2_source")
+            if self.video_device != "none":
+                v4l2_args["device"] = self.video_device
+            source = V4L2VideoCaptureOp(
+                self,
+                name="v4l2_source",
+                allocator=pool,
+                **v4l2_args,
+            )
+            source_output = "signal"
+
+        elif self.source == "replayer":
+            source = VideoStreamReplayerOp(
+                self,
+                name="replayer_source",
+                directory=self.sample_data_path,
+                **self.kwargs("replayer_source"),
+            )
+            source_output = "output"
 
         formatter_cuda_stream_pool = CudaStreamPool(
             self,
@@ -142,7 +170,7 @@ def compose(self):
         # Initialize the VLM + WebApp operator
         web_server = VLMWebAppOp(self, name="VLMWebAppOp")
 
-        self.add_flow(source, visualizer, {("signal", "receivers")})
+        self.add_flow(source, visualizer, {(source_output, "receivers")})
         self.add_flow(visualizer, format_converter_vlm, {("render_buffer_output", "source_video")})
         self.add_flow(format_converter_vlm, web_server, {("tensor", "video_stream")})
 
@@ -150,12 +178,35 @@ def main():
 
     # Parse args
     parser = ArgumentParser(description="VILA live application.")
+    parser.add_argument(
+        "-s",
+        "--source",
+        choices=["v4l2", "replayer"],
+        default="v4l2",
+        help=(
+            "If 'v4l2', uses the v4l2 device specified in the yaml file or "
+            " --video_device if specified. "
+            "If 'replayer', uses video stream replayer."
+        ),
+    )
     parser.add_argument(
         "-c",
         "--config",
         default="none",
         help=("Set config path to override the default config file location"),
     )
+    parser.add_argument(
+        "-d",
+        "--data",
+        default="none",
+        help=("Set the data path"),
+    )
+    parser.add_argument(
+        "-v",
+        "--video_device",
+        default="none",
+        help=("The video device to use. By default the application will use /dev/video0"),
+    )
     args = parser.parse_args()
 
     if args.config == "none":
@@ -163,7 +214,7 @@ def main():
     else:
         config_file = args.config
 
-    app = V4L2toVLM()
+    app = V4L2toVLM(args.data, args.source, args.video_device)
     app.config(config_file)
     app.run()
 
diff --git a/applications/vila_live/vila_live.yaml b/applications/vila_live/vila_live.yaml
index 21faf921..21038636 100644
--- a/applications/vila_live/vila_live.yaml
+++ b/applications/vila_live/vila_live.yaml
@@ -14,9 +14,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 ---
-source:
+v4l2_source:
   device: "/dev/video0"
 
+replayer_source: # VideoStreamReplayer
+  # directory: "../data/ultrasound/video"
+  basename: ""
+  frame_rate: 0 # as specified in timestamps
+  repeat: true # default: false
+  realtime: true # default: true
+  count: 0 # default: 0 (no frame count restriction)
+
 holoviz:
   tensors:
     - name: ""
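Taken together, these changes make `run_vila_live.sh` forward its CLI arguments to `vila_live.py` via `"$@"`, so the new source options can be selected at launch time. A usage sketch based only on the flags and defaults introduced in this patch (the device path and data directory shown are the defaults from the code above, passed explicitly for illustration):

```bash
# Live capture: use the V4L2 device from vila_live.yaml, or override it with --video_device
./run_vila_live.sh --source v4l2 --video_device /dev/video0

# Replay: read pre-converted GXF video entities from the data directory
./run_vila_live.sh --source replayer --data /workspace/holohub/data/vila_live
```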