Skip to content

Commit

Permalink
allow replaying videos (#489)
Browse files Browse the repository at this point in the history
lint



added instructions for alternative video ingestion



adjusted launch script



video options

Signed-off-by: maxofir <[email protected]>
  • Loading branch information
maximilianofir authored Oct 9, 2024
1 parent cdfcd83 commit 3f35dd7
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 13 deletions.
13 changes: 13 additions & 0 deletions applications/vila_live/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,19 @@ Once the main LMM-based app is running, you will see a link for the app at `http
sudo ln -s libv4l2.so.0.0.0.0 libv4l2.so.0.0.999999
```

## 📷⚙️ Video Options
There are three options for ingesting video data.

1. Use a physical device or capture card, such as a v4l2 device, as described in the [Setup Instructions](#️-setup-instructions). Make sure the [vila_live.yaml](./vila_live.yaml) contains the `v4l2_source` group and specifies the device correctly.
2. Convert a video file to a GXF-compatible format using the [convert_video_to_gxf_entities.py](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/scripts#convert_video_to_gxf_entitiespy) script. See the [yolo_model_deployment](https://github.com/nvidia-holoscan/holohub/tree/main/applications/yolo_model_deployment#step-2-deployment) application for a detailed example. When using the replayer, configure the `replayer_source` group in the yaml file and launch the application with:
```bash
./run_vila_live.sh --source "replayer"
```
3. Create a virtual video device that mounts a video file and replays it, as detailed in the [v4l2_camera examples in holoscan-sdk](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/examples/v4l2_camera#use-with-v4l2-loopback-devices). This approach may require signing the [v4l2loopback kernel module](https://github.com/umlaeute/v4l2loopback) when using a system with secure boot enabled. Make sure the `vila_live.yaml` contains the `v4l2_source` group and specifies the virtual device correctly. Replay the video using, for example:
```bash
ffmpeg -stream_loop -1 -re -i <your_video_path> -pix_fmt yuyv422 -f v4l2 /dev/video3
```
## 🙌 Acknowledgements
- Jetson AI Lab, [Live LLaVA](https://www.jetson-ai-lab.com/tutorial_live-llava.html): for the inspiration to create this app
- [Jetson-Containers](https://github.com/dusty-nv/jetson-containers/tree/master/packages/llm/llamaspeak) repo: For the Flask web-app with WebSockets
Expand Down
2 changes: 1 addition & 1 deletion applications/vila_live/run_vila_live.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ python3 -m tinychat.serve.controller --host 0.0.0.0 --port 10000 & bg_pids+=($!)
python3 -m tinychat.serve.model_worker_new --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 \
--model-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/ \
--quant-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/llm/llama-3-vila1.5-8b-w4-g128-awq-v2.pt & bg_pids+=($!)
python3 /workspace/holohub/applications/vila_live/vila_live.py & bg_pids+=($!)
python3 /workspace/holohub/applications/vila_live/vila_live.py "$@" & bg_pids+=($!)

# Let the script clean up the server process
wait
73 changes: 62 additions & 11 deletions applications/vila_live/vila_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@

import cupy as cp
from holoscan.core import Application, Operator, OperatorSpec
from holoscan.operators import FormatConverterOp, HolovizOp, V4L2VideoCaptureOp
from holoscan.operators import (
FormatConverterOp,
HolovizOp,
V4L2VideoCaptureOp,
VideoStreamReplayerOp,
)
from holoscan.resources import CudaStreamPool, UnboundedAllocator
from PIL import Image
from vlm import VLM
Expand Down Expand Up @@ -83,20 +88,43 @@ def compute(self, op_input, op_output, context):


class V4L2toVLM(Application):
def __init__(self):
def __init__(self, data, source="v4l2", video_device="none"):
"""V4L2 to VLM app"""
super().__init__()
# set name
self.name = "V4L2 to VLM app"
self.source = source

if data == "none":
data = "/workspace/holohub/data/vila_live"

self.sample_data_path = data
self.video_device = video_device

def compose(self):
# V4L2 to capture usb camera input
source = V4L2VideoCaptureOp(
self,
name="source",
allocator=UnboundedAllocator(self, name="pool"),
**self.kwargs("source"),
)
pool = UnboundedAllocator(self, name="pool")

# V4L2 to capture usb camera input or replayer to replay video
if self.source == "v4l2":
v4l2_args = self.kwargs("v4l2_source")
if self.video_device != "none":
v4l2_args["device"] = self.video_device
source = V4L2VideoCaptureOp(
self,
name="v4l2_source",
allocator=pool,
**v4l2_args,
)
source_output = "signal"

elif self.source == "replayer":
source = VideoStreamReplayerOp(
self,
name="replayer_source",
directory=self.sample_data_path,
**self.kwargs("replayer_source"),
)
source_output = "output"

formatter_cuda_stream_pool = CudaStreamPool(
self,
Expand Down Expand Up @@ -142,28 +170,51 @@ def compose(self):
# Initialize the VLM + WebApp operator
web_server = VLMWebAppOp(self, name="VLMWebAppOp")

self.add_flow(source, visualizer, {("signal", "receivers")})
self.add_flow(source, visualizer, {(source_output, "receivers")})
self.add_flow(visualizer, format_converter_vlm, {("render_buffer_output", "source_video")})
self.add_flow(format_converter_vlm, web_server, {("tensor", "video_stream")})


def main():
# Parse args
parser = ArgumentParser(description="VILA live application.")
parser.add_argument(
"-s",
"--source",
choices=["v4l2", "replayer"],
default="v4l2",
help=(
"If 'v4l2', uses the v4l2 device specified in the yaml file or "
" --video_device if specified. "
"If 'replayer', uses video stream replayer."
),
)
parser.add_argument(
"-c",
"--config",
default="none",
help=("Set config path to override the default config file location"),
)
parser.add_argument(
"-d",
"--data",
default="none",
help=("Set the data path"),
)
parser.add_argument(
"-v",
"--video_device",
default="none",
help=("The video device to use. By default the application will use /dev/video0"),
)
args = parser.parse_args()

if args.config == "none":
config_file = os.path.join(os.path.dirname(__file__), "vila_live.yaml")
else:
config_file = args.config

app = V4L2toVLM()
app = V4L2toVLM(args.data, args.source, args.video_device)
app.config(config_file)
app.run()

Expand Down
10 changes: 9 additions & 1 deletion applications/vila_live/vila_live.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
source:
v4l2_source:
device: "/dev/video0"

replayer_source: # VideoStreamReplayer
# directory: "../data/ultrasound/video"
basename: ""
frame_rate: 0 # as specified in timestamps
repeat: true # default: false
realtime: true # default: true
count: 0 # default: 0 (no frame count restriction)

holoviz:
tensors:
- name: ""
Expand Down

0 comments on commit 3f35dd7

Please sign in to comment.