Skip to content

Commit

Permalink
allow replaying videos (#489)
Browse files Browse the repository at this point in the history
lint



added instructions for alternative video ingestion



adjusted launch script



video options

Signed-off-by: maxofir <[email protected]>
  • Loading branch information
maximilianofir authored Oct 9, 2024
1 parent cdfcd83 commit 3f35dd7
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 13 deletions.
13 changes: 13 additions & 0 deletions applications/vila_live/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,19 @@ Once the main LMM-based app is running, you will see a link for the app at `http
sudo ln -s libv4l2.so.0.0.0.0 libv4l2.so.0.0.999999
```

## 📷⚙️ Video Options
There are three options for ingesting video data.

1. Use a physical device or capture card, such as a v4l2 device, as described in the [Setup Instructions](#️-setup-instructions). Make sure the [vila_live.yaml](./vila_live.yaml) contains the `v4l2_source` group and specifies the device correctly.
2. Convert a video file to a GXF-compatible format using the [convert_video_to_gxf_entities.py](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/scripts#convert_video_to_gxf_entitiespy) script. See the [yolo_model_deployment](https://github.com/nvidia-holoscan/holohub/tree/main/applications/yolo_model_deployment#step-2-deployment) application for a detailed example. When using the replayer, configure the `replayer_source` group in the yaml file and launch the application with:
```bash
./run_vila_live.sh --source "replayer"
```
3. Create a virtual video device that mounts a video file and replays it, as detailed in the [v4l2_camera examples in holoscan-sdk](https://github.com/nvidia-holoscan/holoscan-sdk/tree/main/examples/v4l2_camera#use-with-v4l2-loopback-devices). This approach may require signing the [v4l2loopback kernel module](https://github.com/umlaeute/v4l2loopback) when using a system with secure boot enabled. Make sure the `vila_live.yaml` contains the `v4l2_source` group and specifies the virtual device correctly. Replay the video using, for example:
```bash
ffmpeg -stream_loop -1 -re -i <your_video_path> -pix_fmt yuyv422 -f v4l2 /dev/video3
```
## 🙌 Acknowledgements
- Jetson AI Lab, [Live LLaVA](https://www.jetson-ai-lab.com/tutorial_live-llava.html): for the inspiration to create this app
- [Jetson-Containers](https://github.com/dusty-nv/jetson-containers/tree/master/packages/llm/llamaspeak) repo: For the Flask web-app with WebSockets
Expand Down
2 changes: 1 addition & 1 deletion applications/vila_live/run_vila_live.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ python3 -m tinychat.serve.controller --host 0.0.0.0 --port 10000 & bg_pids+=($!)
python3 -m tinychat.serve.model_worker_new --host 0.0.0.0 --controller http://localhost:10000 --port 40000 --worker http://localhost:40000 \
--model-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/ \
--quant-path /workspace/volumes/models/Llama-3-VILA1.5-8b-AWQ/llm/llama-3-vila1.5-8b-w4-g128-awq-v2.pt & bg_pids+=($!)
python3 /workspace/holohub/applications/vila_live/vila_live.py & bg_pids+=($!)
python3 /workspace/holohub/applications/vila_live/vila_live.py "$@" & bg_pids+=($!)

# Let the script clean up the server process
wait
73 changes: 62 additions & 11 deletions applications/vila_live/vila_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,12 @@

import cupy as cp
from holoscan.core import Application, Operator, OperatorSpec
from holoscan.operators import FormatConverterOp, HolovizOp, V4L2VideoCaptureOp
from holoscan.operators import (
FormatConverterOp,
HolovizOp,
V4L2VideoCaptureOp,
VideoStreamReplayerOp,
)
from holoscan.resources import CudaStreamPool, UnboundedAllocator
from PIL import Image
from vlm import VLM
Expand Down Expand Up @@ -83,20 +88,43 @@ def compute(self, op_input, op_output, context):


class V4L2toVLM(Application):
def __init__(self):
def __init__(self, data, source="v4l2", video_device="none"):
"""V4L2 to VLM app"""
super().__init__()
# set name
self.name = "V4L2 to VLM app"
self.source = source

if data == "none":
data = "/workspace/holohub/data/vila_live"

self.sample_data_path = data
self.video_device = video_device

def compose(self):
# V4L2 to capture usb camera input
source = V4L2VideoCaptureOp(
self,
name="source",
allocator=UnboundedAllocator(self, name="pool"),
**self.kwargs("source"),
)
pool = UnboundedAllocator(self, name="pool")

# V4L2 to capture usb camera input or replayer to replay video
if self.source == "v4l2":
v4l2_args = self.kwargs("v4l2_source")
if self.video_device != "none":
v4l2_args["device"] = self.video_device
source = V4L2VideoCaptureOp(
self,
name="v4l2_source",
allocator=pool,
**v4l2_args,
)
source_output = "signal"

elif self.source == "replayer":
source = VideoStreamReplayerOp(
self,
name="replayer_source",
directory=self.sample_data_path,
**self.kwargs("replayer_source"),
)
source_output = "output"

formatter_cuda_stream_pool = CudaStreamPool(
self,
Expand Down Expand Up @@ -142,28 +170,51 @@ def compose(self):
# Initialize the VLM + WebApp operator
web_server = VLMWebAppOp(self, name="VLMWebAppOp")

self.add_flow(source, visualizer, {("signal", "receivers")})
self.add_flow(source, visualizer, {(source_output, "receivers")})
self.add_flow(visualizer, format_converter_vlm, {("render_buffer_output", "source_video")})
self.add_flow(format_converter_vlm, web_server, {("tensor", "video_stream")})


def main():
# Parse args
parser = ArgumentParser(description="VILA live application.")
parser.add_argument(
"-s",
"--source",
choices=["v4l2", "replayer"],
default="v4l2",
help=(
"If 'v4l2', uses the v4l2 device specified in the yaml file or "
" --video_device if specified. "
"If 'replayer', uses video stream replayer."
),
)
parser.add_argument(
"-c",
"--config",
default="none",
help=("Set config path to override the default config file location"),
)
parser.add_argument(
"-d",
"--data",
default="none",
help=("Set the data path"),
)
parser.add_argument(
"-v",
"--video_device",
default="none",
help=("The video device to use. By default the application will use /dev/video0"),
)
args = parser.parse_args()

if args.config == "none":
config_file = os.path.join(os.path.dirname(__file__), "vila_live.yaml")
else:
config_file = args.config

app = V4L2toVLM()
app = V4L2toVLM(args.data, args.source, args.video_device)
app.config(config_file)
app.run()

Expand Down
10 changes: 9 additions & 1 deletion applications/vila_live/vila_live.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
---
source:
v4l2_source:
device: "/dev/video0"

replayer_source: # VideoStreamReplayer
# directory: "../data/ultrasound/video"
basename: ""
frame_rate: 0 # as specified in timestamps
repeat: true # default: false
realtime: true # default: true
count: 0 # default: 0 (no frame count restriction)

holoviz:
tensors:
- name: ""
Expand Down

0 comments on commit 3f35dd7

Please sign in to comment.