diff --git a/depthai_sdk/docs/source/conf.py b/depthai_sdk/docs/source/conf.py
index 3e2765f71..2407ed557 100644
--- a/depthai_sdk/docs/source/conf.py
+++ b/depthai_sdk/docs/source/conf.py
@@ -22,7 +22,7 @@
 author = 'Luxonis'
 
 # The full version, including alpha/beta/rc tags
-release = '1.11.0'
+release = '1.12.0'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/depthai_sdk/examples/CameraComponent/camera_control_with_nn.py b/depthai_sdk/examples/CameraComponent/camera_control_with_nn.py
new file mode 100644
index 000000000..f8b228afd
--- /dev/null
+++ b/depthai_sdk/examples/CameraComponent/camera_control_with_nn.py
@@ -0,0 +1,10 @@
+from depthai_sdk import OakCamera
+
+with OakCamera() as oak:
+    color = oak.create_camera('color')
+    face_det = oak.create_nn('face-detection-retail-0004', color)
+    # Control the camera's exposure/focus based on the (largest) detected face
+    color.control_with_nn(face_det, auto_focus=True, auto_exposure=True, debug=False)
+
+    oak.visualize(face_det, fps=True)
+    oak.start(blocking=True)
diff --git a/depthai_sdk/examples/StereoComponent/stereo_encoded.py b/depthai_sdk/examples/StereoComponent/stereo_encoded.py
index 0b5660f09..8ca1eb471 100644
--- a/depthai_sdk/examples/StereoComponent/stereo_encoded.py
+++ b/depthai_sdk/examples/StereoComponent/stereo_encoded.py
@@ -6,7 +6,7 @@
     stereo = oak.create_stereo('800p', fps=30, encode='h264')
 
     # Set on-device output colorization, works only for encoded output
-    stereo.set_colormap(dai.Colormap.STEREO_JET)
+    stereo.set_colormap(dai.Colormap.JET)
 
     oak.visualize(stereo.out.encoded, fps=True)
     oak.start(blocking=True)
diff --git a/depthai_sdk/examples/trigger_action/custom_trigger.py b/depthai_sdk/examples/trigger_action/custom_trigger.py
index 735173805..c69abbedd 100644
--- a/depthai_sdk/examples/trigger_action/custom_trigger.py
+++ b/depthai_sdk/examples/trigger_action/custom_trigger.py
@@ -27,7 +27,7 @@ def my_condition(packet) -> bool:
 with OakCamera() as oak:
     color = oak.create_camera('color', fps=30)
     stereo = oak.create_stereo('800p')
-    stereo.config_stereo(align='color')
+    stereo.config_stereo(align=color)
 
     trigger = Trigger(input=stereo.out.depth, condition=my_condition, cooldown=30)
     action = RecordAction(
diff --git a/depthai_sdk/requirements.txt b/depthai_sdk/requirements.txt
index d1a5f96fa..162165241 100644
--- a/depthai_sdk/requirements.txt
+++ b/depthai_sdk/requirements.txt
@@ -4,7 +4,7 @@ opencv-contrib-python>4
 blobconverter>=1.4.1
 pytube>=12.1.0
 --extra-index-url https://artifacts.luxonis.com/artifactory/luxonis-python-snapshot-local/
-depthai==2.21.2
+depthai==2.22.0
 PyTurboJPEG==1.6.4
 marshmallow==3.17.0
 xmltodict
diff --git a/depthai_sdk/sdk_tests/components/nn/test_nn_component.py b/depthai_sdk/sdk_tests/components/nn/test_nn_component.py
index 5e95f5556..e5156d092 100644
--- a/depthai_sdk/sdk_tests/components/nn/test_nn_component.py
+++ b/depthai_sdk/sdk_tests/components/nn/test_nn_component.py
@@ -91,3 +91,15 @@ def callback(packet):
             if not oak_camera.poll():
                 raise RuntimeError('Polling failed')
             time.sleep(0.1)
+
+
+def test_encoded_output():
+    with OakCamera() as oak_camera:
+        camera = oak_camera.create_camera('color', '1080p', encode='h264')
+
+        oak_camera.callback(camera.out.encoded, lambda x: print(x))
+        oak_camera.start(blocking=False)
+
+        for i in range(10):
+            oak_camera.poll()
+            time.sleep(0.1)
diff --git a/depthai_sdk/sdk_tests/components/stereo/test_stereo_component.py b/depthai_sdk/sdk_tests/components/stereo/test_stereo_component.py
index 2b7bf91c3..f0ef137de 100644
--- a/depthai_sdk/sdk_tests/components/stereo/test_stereo_component.py
+++ b/depthai_sdk/sdk_tests/components/stereo/test_stereo_component.py
@@ -1,8 +1,9 @@
 import time
 
+import depthai as dai
 import pytest
+
 from depthai_sdk.oak_camera import OakCamera
-import depthai as dai
 
 
 def test_stereo_output():
@@ -10,10 +11,11 @@ def test_stereo_output():
         if dai.CameraBoardSocket.LEFT not in oak_camera.sensors:
             pytest.skip('Looks like camera does not have mono pair, skipping...')
         else:
-            stereo = oak_camera.create_stereo('400p')
+            stereo = oak_camera.create_stereo('800p', encode='h264')
 
             oak_camera.callback([stereo.out.depth, stereo.out.disparity,
-                                 stereo.out.rectified_left, stereo.out.rectified_right], callback=lambda x: None)
+                                 stereo.out.rectified_left, stereo.out.rectified_right,
+                                 stereo.out.encoded], callback=lambda x: None)
             oak_camera.start(blocking=False)
 
             for i in range(10):
diff --git a/depthai_sdk/sdk_tests/test_examples.py b/depthai_sdk/sdk_tests/test_examples.py
index 9bb6946db..636c018be 100644
--- a/depthai_sdk/sdk_tests/test_examples.py
+++ b/depthai_sdk/sdk_tests/test_examples.py
@@ -1,27 +1,38 @@
+import os
 import subprocess
 import sys
 import time
 from pathlib import Path
 
 import cv2
+import pytest
 
-EXAMPLES_DIR = Path(__file__).parent.parent.parent / "examples"
+EXAMPLES_DIR = Path(__file__).parents[1] / 'examples'
 
+# Create a temporary directory for the tests
+Path('/tmp/depthai_sdk_tests').mkdir(exist_ok=True)
+os.chdir('/tmp/depthai_sdk_tests')
 
-def test_examples():
-    python_executable = Path(sys.executable)
-    for example in EXAMPLES_DIR.rglob("**/*.py"):
-        print(f"Running example: {example.name}")
-
-        result = subprocess.Popen(f"{python_executable} {example}", stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                                  env={"DISPLAY": ""}, shell=True)
-
-        time.sleep(5)
-        result.kill()
-        time.sleep(5)
-        print('Stderr: ', result.stderr.read().decode())
 
-        # if result.returncode and result.returncode != 0:
-        #     assert False, f"{example} raised an exception: {result.stderr}"
-
-        cv2.destroyAllWindows()
+@pytest.mark.parametrize('example', list(EXAMPLES_DIR.rglob("**/*.py")))
+def test_examples(example):
+    print(f"Running {example}")
+    python_executable = Path(sys.executable)
+    result = subprocess.Popen(f"{python_executable} {example}",
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE,
+                              env={
+                                  'DISPLAY': '',
+                                  'PYTHONPATH': f'{os.environ["PYTHONPATH"]}:{EXAMPLES_DIR.parent}'
+                              },
+                              shell=True)
+
+    time.sleep(5)
+    result.kill()
+    time.sleep(5)
+    print('Stderr: ', result.stderr.read().decode())
+
+    if result.returncode and result.returncode != 0:
+        assert False, f"{example} raised an exception: {result.stderr}"
+
+    cv2.destroyAllWindows()
diff --git a/depthai_sdk/setup.py b/depthai_sdk/setup.py
index 415048309..d78ea7f06 100644
--- a/depthai_sdk/setup.py
+++ b/depthai_sdk/setup.py
@@ -9,7 +9,7 @@
 
 setup(
     name='depthai-sdk',
-    version='1.11.0',
+    version='1.12.0',
     description='This package provides an abstraction of the DepthAI API library.',
     long_description=io.open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -30,7 +30,8 @@
         "replay": ['mcap>=0.0.10',
                    'mcap-ros1-support==0.0.8',
                    'rosbags==0.9.11'],
-        "record": ['av']
+        "record": ['av'],
+        "test": ['pytest']
     },
     project_urls={
         "Bug Tracker": "https://github.com/luxonis/depthai/issues",
diff --git a/depthai_sdk/src/depthai_sdk/__init__.py b/depthai_sdk/src/depthai_sdk/__init__.py
index c3fbab8ae..abfe2341c 100644
--- a/depthai_sdk/src/depthai_sdk/__init__.py
+++ b/depthai_sdk/src/depthai_sdk/__init__.py
@@ -1,5 +1,6 @@
 from depthai_sdk.args_parser import ArgsParser
 from depthai_sdk.classes.enum import ResizeMode
+from depthai_sdk.constants import CV2_HAS_GUI_SUPPORT
 from depthai_sdk.logger import set_logging_level
 from depthai_sdk.oak_camera import OakCamera
 from depthai_sdk.oak_device import OakDevice
@@ -10,7 +11,7 @@
 from depthai_sdk.utils import _create_config, get_config_field
 from depthai_sdk.visualize import *
 
-__version__ = '1.11.0'
+__version__ = '1.12.0'
 
 
 def __import_sentry(sentry_dsn: str) -> None:
diff --git a/depthai_sdk/src/depthai_sdk/classes/output_config.py b/depthai_sdk/src/depthai_sdk/classes/output_config.py
index 5551109a3..77a6144a9 100644
--- a/depthai_sdk/src/depthai_sdk/classes/output_config.py
+++ b/depthai_sdk/src/depthai_sdk/classes/output_config.py
@@ -16,10 +16,21 @@
 from depthai_sdk.trigger_action.triggers.abstract_trigger import Trigger
 from depthai_sdk.visualize.visualizer import Visualizer
 
+def find_new_name(name: str, names: List[str]):
+    while True:
+        arr = name.split(' ')
+        num = arr[-1]
+        if num.isnumeric():
+            arr[-1] = str(int(num) + 1)
+            name = " ".join(arr)
+        else:
+            name = f"{name} 2"
+        if name not in names:
+            return name
 
 class BaseConfig:
     @abstractmethod
-    def setup(self, pipeline: dai.Pipeline, device, names: List[str]) -> List[XoutBase]:
+    def setup(self, pipeline: dai.Pipeline, device: dai.Device, names: List[str]) -> List[XoutBase]:
         raise NotImplementedError()
 
 
@@ -39,24 +50,12 @@ def __init__(self, output: Callable,
         self.visualizer_enabled = visualizer_enabled
         self.record_path = record_path
 
-    def find_new_name(self, name: str, names: List[str]):
-        while True:
-            arr = name.split(' ')
-            num = arr[-1]
-            if num.isnumeric():
-                arr[-1] = str(int(num) + 1)
-                name = " ".join(arr)
-            else:
-                name = f"{name} 2"
-            if name not in names:
-                return name
-
     def setup(self, pipeline: dai.Pipeline, device, names: List[str]) -> List[XoutBase]:
         xoutbase: XoutBase = self.output(pipeline, device)
         xoutbase.setup_base(self.callback)
 
         if xoutbase.name in names:  # Stream name already exist, append a number to it
-            xoutbase.name = self.find_new_name(xoutbase.name, names)
+            xoutbase.name = find_new_name(xoutbase.name, names)
         names.append(xoutbase.name)
 
         recorder = None
diff --git a/depthai_sdk/src/depthai_sdk/classes/packets.py b/depthai_sdk/src/depthai_sdk/classes/packets.py
index d53d61bcd..0fa36c904 100644
--- a/depthai_sdk/src/depthai_sdk/classes/packets.py
+++ b/depthai_sdk/src/depthai_sdk/classes/packets.py
@@ -39,6 +39,7 @@ class _TrackingDetection(_Detection):
 class _TwoStageDetection(_Detection):
     nn_data: dai.NNData
 
+
 class NNDataPacket:
     """
     Contains only dai.NNData message
@@ -50,6 +51,7 @@ def __init__(self, name: str, nn_data: dai.NNData):
         self.name = name
         self.msg = nn_data
 
+
 class FramePacket:
     """
     Contains only dai.ImgFrame message and cv2 frame, which is used by visualization logic.
@@ -80,6 +82,7 @@ def __init__(self,
         self.color_frame = color_frame
         self.visualizer = visualizer
 
+
 class DepthPacket(FramePacket):
     mono_frame: dai.ImgFrame
 
diff --git a/depthai_sdk/src/depthai_sdk/components/camera_component.py b/depthai_sdk/src/depthai_sdk/components/camera_component.py
index f3aaf679e..4122c9375 100644
--- a/depthai_sdk/src/depthai_sdk/components/camera_component.py
+++ b/depthai_sdk/src/depthai_sdk/components/camera_component.py
@@ -46,6 +46,7 @@ def __init__(self,
         """
         super().__init__()
         self.out = self.Out(self)
+        self._pipeline = pipeline
 
         self.node: Optional[Union[dai.node.ColorCamera, dai.node.MonoCamera, dai.node.XLinkIn]] = None
         self.encoder: Optional[dai.node.VideoEncoder] = None
@@ -279,6 +280,40 @@ def _config_camera_args(self, args: Dict):
         else:  # Replay
             self.config_camera(fps=args.get('fps', None))
 
+    def control_with_nn(self, detection_component: 'NNComponent', auto_focus=True, auto_exposure=True, debug=False):
+        """
+        Control the camera AF/AE/AWB based on the object detection results.
+
+        :param detection_component: NNComponent that will be used to control the camera
+        :param auto_focus: Enable auto focus to the object
+        :param auto_exposure: Enable auto exposure to the object
+        :param auto_white_balance: auto white balance to the object
+        """
+
+        if not auto_focus and not auto_exposure:
+            logging.error(
+                'Attempted to control camera with NN, but both Auto-Focus and Auto-Exposure were disabled! Attempt ignored.'
+            )
+            return
+        if 'NNComponent' not in str(type(detection_component)):
+            raise ValueError('nn_component must be an instance of NNComponent!')
+        if not detection_component._is_detector():
+            raise ValueError('nn_component must be a object detection model (YOLO/MobileNetSSD based)!')
+
+        from depthai_sdk.components.control_camera_with_nn import control_camera_with_nn
+
+        control_camera_with_nn(
+            pipeline=self._pipeline,
+            camera_control=self.node.inputControl,
+            nn_output=detection_component.node.out,
+            resize_mode=detection_component._ar_resize_mode,
+            resolution=self.node.getResolution(),
+            nn_size = detection_component._size,
+            af=auto_focus,
+            ae=auto_exposure,
+            debug=debug
+        )
+
     def config_color_camera(self,
                             interleaved: Optional[bool] = None,
                             color_order: Union[None, dai.ColorCameraProperties.ColorOrder, str] = None,
diff --git a/depthai_sdk/src/depthai_sdk/components/camera_helper.py b/depthai_sdk/src/depthai_sdk/components/camera_helper.py
index 6fe6482c1..2fc802319 100644
--- a/depthai_sdk/src/depthai_sdk/components/camera_helper.py
+++ b/depthai_sdk/src/depthai_sdk/components/camera_helper.py
@@ -179,6 +179,18 @@ def get_res(resolutions: Dict[Any, Tuple[int,int]]):
     else:
         raise Exception('Camera sensor type unknown!', type)
 
+def get_resolution_size(
+        resolution: Union[
+            dai.ColorCameraProperties.SensorResolution,
+            dai.MonoCameraProperties.SensorResolution
+            ]) -> Tuple[int,int]:
+    if resolution in colorResolutions:
+        return colorResolutions[resolution]
+    elif resolution in monoResolutions:
+        return monoResolutions[resolution]
+    else:
+        raise Exception('Camera sensor resolution unknown!', resolution)
+
 def getClosesResolution(sensor: dai.CameraFeatures,
                         type: dai.CameraSensorType,
                         width: Optional[int] = None,
diff --git a/depthai_sdk/src/depthai_sdk/components/control_camera_with_nn.py b/depthai_sdk/src/depthai_sdk/components/control_camera_with_nn.py
new file mode 100644
index 000000000..f804f4974
--- /dev/null
+++ b/depthai_sdk/src/depthai_sdk/components/control_camera_with_nn.py
@@ -0,0 +1,69 @@
+import depthai as dai
+from depthai_sdk.classes.enum import ResizeMode
+from typing import Union, Tuple
+from depthai_sdk.components.camera_helper import get_resolution_size
+from pathlib import Path
+from string import Template
+
+def control_camera_with_nn(
+        pipeline: dai.Pipeline,
+        camera_control: dai.Node.Input,
+        nn_output: dai.Node.Output,
+        resize_mode: ResizeMode,
+        resolution: Union[dai.ColorCameraProperties.SensorResolution, dai.MonoCameraProperties.SensorResolution],
+        nn_size: Tuple[int, int],
+        af: bool,
+        ae: bool,
+        debug: bool
+        ):
+    sensor_resolution = get_resolution_size(resolution)
+    # width / height (old ar)
+    sensor_ar = sensor_resolution[0] / sensor_resolution[1]
+    # NN ar (new ar)
+    nn_ar = nn_size[0] / nn_size[1]
+
+    if resize_mode == ResizeMode.LETTERBOX:
+        padding = (sensor_ar - nn_ar) / 2
+        if padding > 0:
+            init = f"xmin = 0; ymin = {-padding}; xmax = 1; ymax = {1 + padding}"
+        else:
+            init = f"xmin = {padding}; ymin = 0; xmax = {1 - padding}; ymax = 1"
+    elif resize_mode in [ResizeMode.CROP, ResizeMode.FULL_CROP]:
+        cropping = (1 - (nn_ar / sensor_ar)) / 2
+        if cropping < 0:
+            init = f"xmin = 0; ymin = {-cropping}; xmax = 1; ymax = {1 + cropping}"
+        else:
+            init = f"xmin = {cropping}; ymin = 0; xmax = {1 - cropping}; ymax = 1"
+    else: # Stretch
+        init = f"xmin=0; ymin=0; xmax=1; ymax=1"
+
+
+    resize_str = f"new_xmin=xmin+width*det.xmin; new_ymin=ymin+height*det.ymin; new_xmax=xmin+width*det.xmax; new_ymax=ymin+height*det.ymax;"
+    denormalize = f"startx=int(new_xmin*{sensor_resolution[0]}); starty=int(new_ymin*{sensor_resolution[1]}); new_width=int((new_xmax-new_xmin)*{sensor_resolution[0]}); new_height=int((new_ymax-new_ymin)*{sensor_resolution[1]});"
+    control_str = ''
+    if ae:
+        control_str += f"control.setAutoExposureRegion(startx, starty, new_width, new_height);"
+    if af:
+        control_str += f"control.setAutoFocusRegion(startx, starty, new_width, new_height);"
+
+
+    script_node = pipeline.create(dai.node.Script)
+    script_node.setProcessor(dai.ProcessorType.LEON_CSS)  # More stable
+
+    with open(Path(__file__).parent / 'template_control_cam_with_nn.py', 'r') as file:
+        code = Template(file.read()).substitute(
+            DEBUG='' if debug else '#',
+            INIT=init,
+            RESIZE=resize_str,
+            DENORMALIZE=denormalize,
+            CONTROL=control_str
+        )
+        script_node.setScript(code)
+
+        if debug:
+            print(code)
+
+    # Node linking:
+    # NN output -> Script -> Camera input
+    nn_output.link(script_node.inputs['detections'])
+    script_node.outputs['control'].link(camera_control)
diff --git a/depthai_sdk/src/depthai_sdk/components/multi_stage_nn.py b/depthai_sdk/src/depthai_sdk/components/multi_stage_nn.py
index 167cb007b..183bea93c 100644
--- a/depthai_sdk/src/depthai_sdk/components/multi_stage_nn.py
+++ b/depthai_sdk/src/depthai_sdk/components/multi_stage_nn.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 from string import Template
 from typing import Tuple, Optional, List
-
+from depthai_sdk.classes.enum import ResizeMode
 import depthai as dai
 
 from depthai_sdk.types import GenericNeuralNetwork
@@ -14,6 +14,9 @@ def __init__(self,
                  detection_node: GenericNeuralNetwork,  # Object detection node
                  high_res_frames: dai.Node.Output,
                  size: Tuple[int, int],
+                 frame_size: Tuple[int, int],
+                 det_nn_size: Tuple[int, int],
+                 resize_mode: ResizeMode,
                  num_frames_pool: int = 20,
                  ) -> None:
         """
@@ -21,10 +24,30 @@ def __init__(self,
             pipeline (dai.Pipeline): Pipeline object
             detection_node (GenericNeuralNetwork): Object detection NN
             high_res_frames (dai.Node.Output): Frames corresponding to the detection NN
-            size (Tuple[int, int]): Size of the frames.
+            size (Tuple[int, int]): NN input size of the second stage NN
+            frame_size (Tuple[int, int]): Frame size of the first (detection) stage NN
+            det_nn_size (Tuple[int, int]): NN input size of the first (detection) stage NN
+            resize_mode (ResizeMode): Resize mode that was used to resize frame for first (detection) stage
             debug (bool, optional): Enable debug mode. Defaults to False.
             num_frames_pool (int, optional): Number of frames to keep in the pool. Defaults to 20.
         """
+        frame_size_ar = frame_size[0] / frame_size[1]
+        det_nn_size_ar = det_nn_size[0] / det_nn_size[1]
+        if resize_mode == ResizeMode.LETTERBOX:
+            padding = (frame_size_ar - det_nn_size_ar) / 2
+            if padding > 0:
+                self.init = f"xmin = 0; ymin = {-padding}; xmax = 1; ymax = {1 + padding}"
+            else:
+                self.init = f"xmin = {padding}; ymin = 0; xmax = {1 - padding}; ymax = 1"
+        elif resize_mode in [ResizeMode.CROP, ResizeMode.FULL_CROP]:
+            cropping = (1 - (det_nn_size_ar / frame_size_ar)) / 2
+            if cropping < 0:
+                self.init = f"xmin = 0; ymin = {-cropping}; xmax = 1; ymax = {1 + cropping}"
+            else:
+                self.init = f"xmin = {cropping}; ymin = 0; xmax = {1 - cropping}; ymax = 1"
+        else: # Stretch
+            self.init = f"xmin=0; ymin=0; xmax=1; ymax=1"
+
         self.script: dai.node.Script = pipeline.create(dai.node.Script)
         self.script.setProcessor(dai.ProcessorType.LEON_CSS)  # More stable
         self._size: Tuple[int, int] = size
@@ -61,6 +84,7 @@ def configure(self,
                 CHECK_LABELS=f"if det.label not in {str(whitelist_labels)}: continue" if whitelist_labels else "",
                 WIDTH=str(self._size[0]),
                 HEIGHT=str(self._size[1]),
+                INIT=self.init,
                 SCALE_BB_XMIN=f"-{scale_bb[0] / 100}" if scale_bb else "",  # % to float value
                 SCALE_BB_YMIN=f"-{scale_bb[1] / 100}" if scale_bb else "",
                 SCALE_BB_XMAX=f"+{scale_bb[0] / 100}" if scale_bb else "",
diff --git a/depthai_sdk/src/depthai_sdk/components/nn_component.py b/depthai_sdk/src/depthai_sdk/components/nn_component.py
index 8994a2d5e..5126d9f46 100644
--- a/depthai_sdk/src/depthai_sdk/components/nn_component.py
+++ b/depthai_sdk/src/depthai_sdk/components/nn_component.py
@@ -158,29 +158,24 @@ def __init__(self,
             # Link ImageManip output to NN node
             self.image_manip.out.link(self.node.input)
         elif self._is_multi_stage():
-            # Calculate crop shape of the object detector.
-            # TODO: in the future also support stretching and letterboxing resize modes
-            frame_size = self._input._input.stream_size
-            nn_size = self._input._size
-            scale = frame_size[0] / nn_size[0], frame_size[1] / nn_size[1]
-            i = 0 if scale[0] < scale[1] else 1
-            crop = int(scale[i] * nn_size[0]), int(scale[i] * nn_size[1])
-
             # Here, ImageManip will only crop the high-res frame to correct aspect ratio
             # (without resizing!) and it also acts as a buffer (by default, its pool size is set to 20).
             self.image_manip = pipeline.createImageManip()
             self.image_manip.setNumFramesPool(20)
             self._input._stream_input.link(self.image_manip.inputImage)
+            frame_full_size = self._input._input.stream_size
 
             if self._input._is_detector():
-                self.image_manip.setResize(*crop)
-                self.image_manip.setMaxOutputFrameSize(crop[0] * crop[1] * 3)
+                self.image_manip.setMaxOutputFrameSize(frame_full_size[0] * frame_full_size[1] * 3)
 
                 # Create script node, get HQ frames from input.
                 self._multi_stage_nn = MultiStageNN(pipeline=pipeline,
                                                     detection_node=self._input.node,
                                                     high_res_frames=self.image_manip.out,
                                                     size=self._size,
+                                                    frame_size=frame_full_size,
+                                                    det_nn_size=self._input._size,
+                                                    resize_mode=self._input._ar_resize_mode,
                                                     num_frames_pool=20)
                 self._multi_stage_nn.out.link(self.node.input)  # Cropped frames
 
@@ -197,7 +192,7 @@ def __init__(self,
                 # TODO pass frame on device, and just send config from host
                 self.x_in = pipeline.createXLinkIn()
                 self.x_in.setStreamName("input_queue")
-                self.x_in.setMaxDataSize(frame_size[0] * frame_size[1] * 3)
+                self.x_in.setMaxDataSize(frame_full_size[0] * frame_full_size[1] * 3)
                 self.x_in.out.link(self.image_manip.inputImage)
 
                 self.x_in_cfg = pipeline.createXLinkIn()
@@ -387,7 +382,7 @@ def _change_resize_mode(self, mode: ResizeMode) -> None:
             mode (ResizeMode): Resize mode to use
         """
         if self._is_multi_stage():
-            return # We need high-res frames for multi-stage NN, so we can crop them later
+            return  # We need high-res frames for multi-stage NN, so we can crop them later
 
         self._ar_resize_mode = mode
 
@@ -454,7 +449,7 @@ def _parse_label(self, label: Union[str, int]) -> int:
 
     def config_tracker(self,
                        tracker_type: Optional[dai.TrackerType] = None,
-                       track_labels: Optional[List[int]] = None,
+                       track_labels: Optional[List[Union[int, str]]] = None,
                        assignment_policy: Optional[dai.TrackerIdAssignmentPolicy] = None,
                        max_obj: Optional[int] = None,
                        threshold: Optional[float] = None,
@@ -618,6 +613,12 @@ def main(self, pipeline: dai.Pipeline, device: dai.Device) -> XoutBase:
             Default output. Streams NN results and high-res frames that were downscaled and used for inferencing.
             Produces DetectionPacket or TwoStagePacket (if it's 2. stage NNComponent).
             """
+            if self._comp._is_multi_stage():
+                input_nn = self._comp._input
+                if input_nn._input.encoder:
+                    return self.encoded(pipeline=pipeline, device=device)
+            elif self._comp._input.encoder:
+                return self.encoded(pipeline=pipeline, device=device)
 
             if self._comp._is_multi_stage():
                 det_nn_out = StreamXout(id=self._comp._input.node.id,
@@ -744,12 +745,33 @@ def encoded(self, pipeline: dai.Pipeline, device: dai.Device) -> XoutNnResults:
             Streams NN results and encoded frames (frames used for inferencing)
             Produces DetectionPacket or TwoStagePacket (if it's 2. stage NNComponent).
             """
+            if self._comp._is_multi_stage():
+                input_nn = self._comp._input
+
+                if input_nn._input.encoder is None:
+                    raise Exception('Encoder not enabled for the input')
+
+                det_nn_out = StreamXout(id=self._comp._input.node.id,
+                                        out=self._comp._input.node.out,
+                                        name=self._comp._input.name)
+                frames = StreamXout(id=input_nn._input.encoder.id,
+                                    out=input_nn._input.encoder.bitstream,
+                                    name=self._comp.name)
+                second_nn_out = StreamXout(self._comp.node.id, self._comp.node.out, name=self._comp.name)
+
+                out = XoutTwoStage(det_nn=self._comp._input,
+                                   second_nn=self._comp,
+                                   frames=frames,
+                                   det_out=det_nn_out,
+                                   second_nn_out=second_nn_out,
+                                   device=device,
+                                   input_queue_name="input_queue" if self._comp.x_in else None)
+
+                return self._comp._create_xout(pipeline, out)
+
             if self._comp._input.encoder is None:
                 raise Exception('Encoder not enabled for the input')
 
-            if self._comp._is_multi_stage():
-                raise NotImplementedError('Encoded output not supported for 2-stage NNs at the moment.')
-
             if self._comp._input._encoder_profile == dai.VideoEncoderProperties.Profile.MJPEG:
                 out = XoutNnMjpeg(
                     det_nn=self._comp,
diff --git a/depthai_sdk/src/depthai_sdk/components/stereo_component.py b/depthai_sdk/src/depthai_sdk/components/stereo_component.py
index 1c813b6d8..9f6f7608d 100644
--- a/depthai_sdk/src/depthai_sdk/components/stereo_component.py
+++ b/depthai_sdk/src/depthai_sdk/components/stereo_component.py
@@ -1,7 +1,7 @@
 import logging
 import warnings
 from enum import Enum
-from typing import Optional, Union, Any, Dict, Tuple
+from typing import Optional, Union, Any, Dict, Tuple, List
 
 import cv2
 import depthai as dai
@@ -123,13 +123,23 @@ def __init__(self,
             # If not specified, default to 400P resolution for faster processing
             self._resolution = self._resolution or dai.MonoCameraProperties.SensorResolution.THE_400_P
 
+            # Always use 1200p for OAK-D-LR and OAK-D-SR
+            if self._device.getDeviceName() == 'OAK-D-LR':
+                self._resolution = dai.MonoCameraProperties.SensorResolution.THE_1200_P
+
             if not self.left:
                 self.left = CameraComponent(device, pipeline, 'left', self._resolution, self._fps, replay=self._replay)
             if not self.right:
                 self.right = CameraComponent(device, pipeline, 'right', self._resolution, self._fps,
                                              replay=self._replay)
 
-            if 0 < len(device.getIrDrivers()):
+            # AR0234 outputs 1200p, so we need to resize it to 800p on RVC2
+            if self._device.getDeviceName() == 'OAK-D-LR':
+                if isinstance(self.left, CameraComponent) and isinstance(self.right, CameraComponent):
+                    self.left.config_color_camera(isp_scale=(2, 3))
+                    self.right.config_color_camera(isp_scale=(2, 3))
+
+            if self._get_ir_drivers():
                 laser = self._args.get('irDotBrightness', None)
                 laser = laser if laser is not None else 800
                 if 0 < laser:
@@ -347,19 +357,28 @@ def set_auto_ir(self, auto_mode: bool, continuous_mode: bool = False) -> None:
             auto_mode: Enable/disable auto IR.
             continuous_mode: Enable/disable continious mode.
         """
-        warnings.warn('Auto IR is an experimental feature, which may not work as expected. '
-                      'Please report any issues at https://discuss.luxonis.com/t/support/.')
-        self.ir_settings = {
-            'auto_mode': auto_mode,
-            'continuous_mode': continuous_mode
-        }
-
-    def set_ir(self, dot_projector_brightness: int, flood_brightness: int):
+        if self._get_ir_drivers():
+            self.ir_settings = {
+                'auto_mode': auto_mode,
+                'continuous_mode': continuous_mode
+            }
+            self.set_ir(0, 0)
+
+    def set_ir(self, dot_projector_brightness: int = None, flood_brightness: int = None):
         """
         Sets IR brightness and flood.
         """
-        self._device.setIrLaserDotProjectorBrightness(dot_projector_brightness)
-        self._device.setIrFloodLightBrightness(flood_brightness)
+        if self._get_ir_drivers():
+            if dot_projector_brightness is not None:
+                self._device.setIrLaserDotProjectorBrightness(dot_projector_brightness)
+            if flood_brightness is not None:
+                self._device.setIrFloodLightBrightness(flood_brightness)
+
+    def _get_ir_drivers(self) -> List[Tuple[str, int, int]]:
+        """
+        Returns a list of IR drivers.
+        """
+        return self._device.getIrDrivers()
 
     def _get_disparity_factor(self, device: dai.Device) -> float:
         """
diff --git a/depthai_sdk/src/depthai_sdk/components/template_control_cam_with_nn.py b/depthai_sdk/src/depthai_sdk/components/template_control_cam_with_nn.py
new file mode 100644
index 000000000..e153ff9cd
--- /dev/null
+++ b/depthai_sdk/src/depthai_sdk/components/template_control_cam_with_nn.py
@@ -0,0 +1,27 @@
+${INIT}
+width=xmax-xmin; height=ymax-ymin;
+while True:
+    detections = node.io['detections'].get()
+    if len(detections.detections) == 0:
+        continue
+    largest_det = detections.detections[0]
+    largest_size = 0
+    for det in detections.detections:
+        size = (det.xmax - det.xmin) * (det.ymax - det.ymin)
+        if size > largest_size:
+            largest_size = size
+            largest_det = det
+    det = largest_det
+    ${DEBUG}node.warn(f"Detected ({det.xmin}, {det.ymin}) ({det.xmax}, {det.ymax})")
+    ${RESIZE}
+    if new_xmin < 0: new_xmin = 0.001
+    if new_ymin < 0: new_ymin = 0.001
+    if new_xmax > 1: new_xmax = 0.999
+    if new_ymax > 1: new_ymax = 0.999
+
+    ${DEBUG}node.warn(f"New ({new_xmin}, {new_ymin}) ({new_xmax}, {new_ymax})")
+    ${DENORMALIZE}
+    ${DEBUG}node.warn(f"Denormalized START ({startx}, {starty}) Width: {new_width}, height: {new_height})")
+    control = CameraControl(1)
+    ${CONTROL}
+    node.io['control'].send(control)
\ No newline at end of file
diff --git a/depthai_sdk/src/depthai_sdk/components/template_multi_stage_script.py b/depthai_sdk/src/depthai_sdk/components/template_multi_stage_script.py
index 0c812446c..3e0da3772 100644
--- a/depthai_sdk/src/depthai_sdk/components/template_multi_stage_script.py
+++ b/depthai_sdk/src/depthai_sdk/components/template_multi_stage_script.py
@@ -1,6 +1,10 @@
 import time
 msgs = dict()
 cntr = 0
+
+${INIT}
+width=xmax-xmin; height=ymax-ymin;
+
 def add_msg(msg, name, seq = None):
     global msgs
     if seq is None:
@@ -32,10 +36,7 @@ def get_msgs():
     return None
 
 def correct_bb(xmin,ymin,xmax,ymax):
-    if xmin < 0: xmin = 0.001
-    if ymin < 0: ymin = 0.001
-    if xmax > 1: xmax = 0.999
-    if ymax > 1: ymax = 0.999
+    
     return [xmin,ymin,xmax,ymax]
 
 while True:
@@ -64,14 +65,19 @@ def correct_bb(xmin,ymin,xmax,ymax):
         for i, det in enumerate(dets.detections):
             ${CHECK_LABELS}
             cfg = ImageManipConfig()
-            bb = correct_bb(
-                det.xmin${SCALE_BB_XMIN},
-                det.ymin${SCALE_BB_YMIN},
-                det.xmax${SCALE_BB_XMAX},
-                det.ymax${SCALE_BB_YMAX},
-                )
-            cfg.setCropRect(*bb)
-            ${DEBUG}node.warn(f"Sending {i + 1}. det. Det {det.xmin}, {det.ymin}, {det.xmax}, {det.ymax}")
+            # Change to fit the full-frame
+            new_xmin=xmin+width*det.xmin
+            new_ymin=ymin+height*det.ymin
+            new_xmax=xmin+width*det.xmax
+            new_ymax=ymin+height*det.ymax
+
+            if new_xmin < 0: new_xmin = 0.001
+            if new_ymin < 0: new_ymin = 0.001
+            if new_xmax > 1: new_xmax = 0.999
+            if new_ymax > 1: new_ymax = 0.999
+
+            cfg.setCropRect(new_xmin, new_ymin, new_xmax, new_ymax)
+            ${DEBUG}node.warn(f"Sending {i + 1}. det. Det {new_xmin}, {new_ymin}, {new_xmax}, {new_ymax}")
             cfg.setResize(${WIDTH}, ${HEIGHT})
             cfg.setKeepAspectRatio(False)
             node.io['manip_cfg'].send(cfg)
diff --git a/depthai_sdk/src/depthai_sdk/constants.py b/depthai_sdk/src/depthai_sdk/constants.py
new file mode 100644
index 000000000..b0c8c3c02
--- /dev/null
+++ b/depthai_sdk/src/depthai_sdk/constants.py
@@ -0,0 +1,15 @@
+CV2_HAS_GUI_SUPPORT = False
+
+try:
+    import cv2
+    import re
+
+    build_info = cv2.getBuildInformation()
+    gui_support_regex = re.compile(r'GUI: +([A-Z]+)')
+    gui_support_match = gui_support_regex.search(build_info)
+    if gui_support_match:
+        gui_support = gui_support_match.group(1)
+        if gui_support.upper() != 'NONE':
+            CV2_HAS_GUI_SUPPORT = True
+except ImportError:
+    pass
diff --git a/depthai_sdk/src/depthai_sdk/oak_camera.py b/depthai_sdk/src/depthai_sdk/oak_camera.py
index 321eb3d10..a701d194f 100644
--- a/depthai_sdk/src/depthai_sdk/oak_camera.py
+++ b/depthai_sdk/src/depthai_sdk/oak_camera.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 from typing import Dict, Any, Optional, List, Union, Callable
 
+from depthai_sdk import CV2_HAS_GUI_SUPPORT
 from depthai_sdk.visualize.visualizer import Visualizer
 
 try:
@@ -399,7 +400,7 @@ def poll(self) -> Optional[int]:
 
         Returns: key pressed from cv2.waitKey, or None if
         """
-        if cv2:
+        if CV2_HAS_GUI_SUPPORT:
             key = cv2.waitKey(1)
             if key == ord('q'):
                 self._stop = True
diff --git a/depthai_sdk/src/depthai_sdk/oak_outputs/syncing.py b/depthai_sdk/src/depthai_sdk/oak_outputs/syncing.py
index 9c1255706..addcac847 100644
--- a/depthai_sdk/src/depthai_sdk/oak_outputs/syncing.py
+++ b/depthai_sdk/src/depthai_sdk/oak_outputs/syncing.py
@@ -1,3 +1,4 @@
+import threading
 from typing import Dict, List, Any, Optional
 
 
@@ -21,23 +22,28 @@ class SequenceNumSync:
     def __init__(self, stream_num: int):
         self.msgs: Dict[str, Dict[str, Any]] = dict()
         self.stream_num: int = stream_num
+        self.lock = threading.Lock()
 
     def sync(self, seq_num: int, name: str, msg) -> Optional[Dict]:
         seq_num = str(seq_num)
-        if seq_num not in self.msgs: self.msgs[seq_num] = dict()
 
-        self.msgs[seq_num][name] = (msg)
+        with self.lock:
+            if seq_num not in self.msgs:
+                self.msgs[seq_num] = dict()
 
-        if self.stream_num == len(self.msgs[seq_num]):
-            # We have sequence num synced frames!
-            ret = self.msgs[seq_num]
+            self.msgs[seq_num][name] = (msg)
 
-            # Remove previous msgs
-            new_msgs = {}
-            for name, msg in self.msgs.items():
-                if int(name) > int(seq_num):
-                    new_msgs[name] = msg
-            self.msgs = new_msgs
+            if self.stream_num == len(self.msgs[seq_num]):
+                # We have sequence num synced frames!
+                ret = self.msgs[seq_num]
+
+                # Remove previous msgs
+                new_msgs = {}
+                for name, msg in self.msgs.items():
+                    if int(name) > int(seq_num):
+                        new_msgs[name] = msg
+                self.msgs = new_msgs
+
+                return ret
 
-            return ret
         return None
diff --git a/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_disparity.py b/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_disparity.py
index 6856ff8dc..e6e8239aa 100644
--- a/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_disparity.py
+++ b/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_disparity.py
@@ -90,13 +90,17 @@ def on_callback(self, packet) -> None:
     def visualize(self, packet: DepthPacket):
         frame = packet.frame
         disparity_frame = (frame * self.multiplier).astype(np.uint8)
+        try:
+            mono_frame = packet.mono_frame.getCvFrame()
+        except AttributeError:
+            mono_frame = None
 
         stereo_config = self._visualizer.config.stereo
 
         if self.use_wls_filter or stereo_config.wls_filter:
             self.wls_filter.setLambda(self.wls_lambda or stereo_config.wls_lambda)
             self.wls_filter.setSigmaColor(self.wls_sigma or stereo_config.wls_sigma)
-            disparity_frame = self.wls_filter.filter(disparity_frame, packet.mono_frame.getCvFrame())
+            disparity_frame = self.wls_filter.filter(disparity_frame, mono_frame)
 
         colorize = self.colorize or stereo_config.colorize
         if self.colormap is not None:
@@ -105,13 +109,16 @@ def visualize(self, packet: DepthPacket):
             colormap = stereo_config.colormap
             colormap[0] = [0, 0, 0]  # Invalidate pixels 0 to be black
 
+        if mono_frame is not None and disparity_frame.ndim == 2 and mono_frame.ndim == 3:
+            disparity_frame = disparity_frame[..., np.newaxis]
+
         if colorize == StereoColor.GRAY:
             packet.frame = disparity_frame
         elif colorize == StereoColor.RGB:
             packet.frame = cv2.applyColorMap(disparity_frame, colormap)
         elif colorize == StereoColor.RGBD:
             packet.frame = cv2.applyColorMap(
-                (disparity_frame * 0.5 + packet.mono_frame.getCvFrame() * 0.5).astype(np.uint8), colormap
+                (disparity_frame * 1.0 + mono_frame * 0.5).astype(np.uint8), colormap
             )
 
         if self._visualizer.config.output.clickable:
diff --git a/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_nn.py b/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_nn.py
index 3e3561338..64641493c 100644
--- a/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_nn.py
+++ b/depthai_sdk/src/depthai_sdk/oak_outputs/xout/xout_nn.py
@@ -1,3 +1,4 @@
+import threading
 from typing import List, Union, Dict, Any, Optional, Tuple
 
 import depthai as dai
@@ -293,6 +294,8 @@ def __init__(self,
         self.input_queue = None
         self.input_cfg_queue = None
 
+        self.lock = threading.Lock()
+
     def xstreams(self) -> List[StreamXout]:
         return [self.frames, self.nn_results, self.second_nn_out]
 
@@ -398,11 +401,12 @@ def new_msg(self, name: str, msg: dai.Buffer) -> None:
             #     if int(s) <= int(seq):
             #         del self.msgs[s]
 
-            new_msgs = {}
-            for name, msg in self.msgs.items():
-                if int(name) > int(seq):
-                    new_msgs[name] = msg
-            self.msgs = new_msgs
+            with self.lock:
+                new_msgs = {}
+                for name, msg in self.msgs.items():
+                    if int(name) > int(seq):
+                        new_msgs[name] = msg
+                self.msgs = new_msgs
 
     def add_detections(self, seq: str, dets: dai.ImgDetections):
         # Used to match the scaled bounding boxes by the 2-stage NN script node
diff --git a/depthai_sdk/src/depthai_sdk/readers/videocap_reader.py b/depthai_sdk/src/depthai_sdk/readers/videocap_reader.py
index b637a01bd..725729ce5 100644
--- a/depthai_sdk/src/depthai_sdk/readers/videocap_reader.py
+++ b/depthai_sdk/src/depthai_sdk/readers/videocap_reader.py
@@ -28,7 +28,8 @@ def __init__(self, path: Path, loop: bool = False) -> None:
         self._is_looped = loop
 
         if path.is_file():
-            self.videos[path.stem.lower()] = {
+            stream_name = path.stem if (path.stem in ['left', 'right']) else 'color'
+            self.videos[stream_name] = {
                 'reader': cv2.VideoCapture(str(path)),
                 'socket': dai.CameraBoardSocket.CAM_A
             }
diff --git a/depthai_sdk/src/depthai_sdk/recorders/video_recorder.py b/depthai_sdk/src/depthai_sdk/recorders/video_recorder.py
index d6f2b7330..0582736bd 100644
--- a/depthai_sdk/src/depthai_sdk/recorders/video_recorder.py
+++ b/depthai_sdk/src/depthai_sdk/recorders/video_recorder.py
@@ -50,7 +50,7 @@ def update(self, path: Path, device: dai.Device, xouts: List['XoutFrames']):
             else:
                 try:
                     from .video_writers.av_writer import AvWriter
-                    self._writers[xout_name] = AvWriter(self.path, xout_name, fourcc, xout.fps)
+                    self._writers[xout_name] = AvWriter(self.path, xout_name, fourcc, xout.fps, xout._frame_shape)
                 except Exception as e:
                     # TODO here can be other errors, not only import error
                     logging.warning(f'Exception while creating AvWriter: {e}.'
diff --git a/depthai_sdk/src/depthai_sdk/recorders/video_writers/av_writer.py b/depthai_sdk/src/depthai_sdk/recorders/video_writers/av_writer.py
index caec89b19..62ed798aa 100644
--- a/depthai_sdk/src/depthai_sdk/recorders/video_writers/av_writer.py
+++ b/depthai_sdk/src/depthai_sdk/recorders/video_writers/av_writer.py
@@ -1,48 +1,128 @@
-from datetime import timedelta
+import os
 from fractions import Fraction
 from pathlib import Path
+from typing import Tuple, Union
 
 import depthai as dai
+import numpy as np
 
-from depthai_sdk.recorders.video_writers import BaseWriter
 from depthai_sdk.recorders.video_writers.utils import create_writer_dir
+from .base_writer import BaseWriter
+
+START_CODE_PREFIX = b"\x00\x00\x01"
+KEYFRAME_NAL_TYPE = 5
+NAL_TYPE_BITS = 5  # nal unit type is encoded in lower 5 bits
+
+
+def is_keyframe(encoded_frame: np.array) -> bool:
+    """
+    Check if encoded frame is a keyframe.
+    Args:
+        encoded_frame: Encoded frame.
+
+    Returns:
+        True if encoded frame is a keyframe, False otherwise.
+    """
+    byte_stream = bytearray(encoded_frame)
+    size = len(byte_stream)
+
+    pos = 0
+    while pos < size:
+        retpos = byte_stream.find(START_CODE_PREFIX, pos)
+        if retpos == -1:
+            return False
+
+        # Skip start code
+        pos = retpos + 3
+
+        # Extract the first 5 bits
+        type_ = byte_stream[pos] >> 3
+
+        if type_ == KEYFRAME_NAL_TYPE:
+            return True
+
+    return False
 
 
 class AvWriter(BaseWriter):
-    def __init__(self, path: Path, name: str, fourcc: str, fps: float):
+    def __init__(self, path: Path, name: str, fourcc: str, fps: float, frame_shape: Tuple[int, int]):
+        """
+        Args:
+            path: Path to the folder where the file will be created.
+            name: Name of the file without extension.
+            fourcc: Stream codec.
+            fps: Frames per second of the stream.
+            frame_shape: Width and height of the frames.
+        """
         super().__init__(path, name)
 
         self.start_ts = None
+        self.frame_shape = frame_shape
+
         self._fps = fps
         self._fourcc = fourcc
+        self._stream = None
+
+    def _create_stream(self, fourcc: str, fps: float) -> None:
+        """
+        Create stream in file with given fourcc and fps, works in-place.
 
-    def _create_stream(self, fourcc, fps) -> None:
-        """Create stream in file with given fourcc and fps, works in-place."""
-        stream = self._file.add_stream(fourcc, rate=int(fps))
-        stream.time_base = Fraction(1, 1000 * 1000)  # Microseconds
+        Args:
+            fourcc: Stream codec.
+            fps: Frames per second of the stream.
+        """
+        self._stream = self._file.add_stream(fourcc, rate=int(fps))
+        self._stream.time_base = Fraction(1, 1000 * 1000)  # Microseconds
 
         # We need to set pixel format for MJEPG, for H264/H265 it's yuv420p by default
         if fourcc == 'mjpeg':
-            stream.pix_fmt = 'yuvj420p'
+            self._stream.pix_fmt = 'yuvj420p'
 
-    def create_file_for_buffer(self, subfolder: str, buf_name: str):  # independent of type of frames
+        if self.frame_shape is not None:
+            self._stream.width = self.frame_shape[0]
+            self._stream.height = self.frame_shape[1]
+
+    def create_file_for_buffer(self, subfolder: str, buf_name: str) -> None:  # independent of type of frames
         self.create_file(subfolder)
 
-    def create_file(self, subfolder: str):
+    def create_file(self, subfolder: str) -> None:
+        """
+        Create file for writing frames.
+
+        Args:
+            subfolder: Subfolder relative to the main folder where the file will be created.
+        """
         path_to_file = create_writer_dir(self.path / subfolder, self.name, 'mp4')
         self._create_file(path_to_file)
 
-    def _create_file(self, path_to_file: str):
+    def _create_file(self, path_to_file: str) -> None:
+        """
+        Create av container for writing frames.
+
+        Args:
+            path_to_file: Path to the file.
+        """
         global av
-        import av as av
-        self._file = av.open(path_to_file, 'w')
+        import av
+        self._file = av.open(str(Path(path_to_file).with_suffix(f'.{self._fourcc}')), 'w')
         self._create_stream(self._fourcc, self._fps)
 
     def write(self, frame: dai.ImgFrame) -> None:
+        """
+        Write packet bytes to h264 file.
+
+        Args:
+            frame: ImgFrame from depthai pipeline.
+        """
         if self._file is None:
             self.create_file(subfolder='')
 
-        packet = av.Packet(frame.getData())  # Create new packet with byte array
+        frame_data = frame.getData()
+
+        if self.start_ts is None and not is_keyframe(frame_data):
+            return
+
+        packet = av.Packet(frame_data)  # Create new packet with byte array
 
         # Set frame timestamp
         if self.start_ts is None:
@@ -51,9 +131,50 @@ def write(self, frame: dai.ImgFrame) -> None:
         ts = int((frame.getTimestampDevice() - self.start_ts).total_seconds() * 1e6)  # To microsec
         packet.dts = ts
         packet.pts = ts
-
         self._file.mux_one(packet)  # Mux the Packet into container
 
-    def close(self):
+    def close(self) -> None:
+        """
+        Close the file and remux it to mp4.
+        """
         if self._file is not None:
+            p = self._stream.encode(None)
+            self._file.mux(p)
             self._file.close()
+
+        # Remux the stream to finalize the output file
+        self.remux_video(str(self._file.name))
+
+    def remux_video(self, input_file: Union[Path, str]) -> None:
+        """
+        Remuxes h264 file to mp4.
+
+        Args:
+            input_file: path to h264 file.
+        """
+
+        mp4_file = str(Path(input_file).with_suffix('.mp4'))
+
+        if input_file == mp4_file:
+            mp4_file = str(Path(input_file).with_suffix('.remuxed.mp4'))
+
+        with av.open(mp4_file, "w", format="mp4") as output_container, \
+                av.open(input_file, "r", format=self._fourcc) as input_container:
+            input_stream = input_container.streams[0]
+            output_stream = output_container.add_stream(template=input_stream, rate=self._fps)
+
+            if self.frame_shape:
+                output_stream.width = self.frame_shape[0]
+                output_stream.height = self.frame_shape[1]
+
+            frame_time = (1 / self._fps) * input_stream.time_base.denominator
+            for i, packet in enumerate(input_container.demux(input_stream)):
+                packet.dts = i * frame_time
+                packet.pts = i * frame_time
+                packet.stream = output_stream
+                output_container.mux_one(packet)
+
+        os.remove(input_file)
+
+        if Path(mp4_file).suffix == '.remuxed.mp4':
+            os.rename(mp4_file, input_file)
diff --git a/depthai_sdk/src/depthai_sdk/trigger_action/actions/record_action.py b/depthai_sdk/src/depthai_sdk/trigger_action/actions/record_action.py
index e78452d86..e249fe33d 100644
--- a/depthai_sdk/src/depthai_sdk/trigger_action/actions/record_action.py
+++ b/depthai_sdk/src/depthai_sdk/trigger_action/actions/record_action.py
@@ -96,7 +96,7 @@ def _run(self):
 
             # Call on_finish_callback if it is specified
             if self.on_finish_callback is not None:
-                self.on_finish_callback(self.path)
+                self.on_finish_callback(str(self.path / subfolder))
 
     def activate(self):
         # Setup for the current recording
diff --git a/depthai_sdk/src/depthai_sdk/utils.py b/depthai_sdk/src/depthai_sdk/utils.py
index f2b835496..516642637 100644
--- a/depthai_sdk/src/depthai_sdk/utils.py
+++ b/depthai_sdk/src/depthai_sdk/utils.py
@@ -429,7 +429,7 @@ def _create_config() -> None:
     config_file = Path.home().joinpath('.depthai_sdk', 'config.json')
     default_config = {
         'sentry': True,
-        'sentry_dsn': 'https://981545d5effd480d883f3ff0b1306e49@o1095304.ingest.sentry.io/4504685274791936'
+        'sentry_dsn': 'https://67bc97fb3ee947bf90d83c892eaf19fe@sentry.luxonis.com/3'
     }
     if not config_file.exists():
         config_file.write_text(json.dumps(default_config))
diff --git a/depthai_sdk/src/depthai_sdk/visualize/configs.py b/depthai_sdk/src/depthai_sdk/visualize/configs.py
index 9d1e43a5c..8ee2b43d6 100644
--- a/depthai_sdk/src/depthai_sdk/visualize/configs.py
+++ b/depthai_sdk/src/depthai_sdk/visualize/configs.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from enum import IntEnum
-from typing import Tuple
+from typing import Tuple, Optional
 import numpy as np
 
 try:
@@ -81,8 +81,10 @@ class TextConfig:
     font_thickness: int = 2
     font_position: TextPosition = TextPosition.TOP_LEFT
 
-    bg_transparency: float = 0.5
-    bg_color: Tuple[int, int, int] = (0, 0, 0)
+    background_color: Optional[Tuple[int, int, int]] = None
+    background_transparency: float = 0.5
+
+    outline_color: Tuple[int, int, int] = (0, 0, 0)
 
     line_type: int = 16  # cv2.LINE_AA
 
diff --git a/depthai_sdk/src/depthai_sdk/visualize/objects.py b/depthai_sdk/src/depthai_sdk/visualize/objects.py
index ba7edf607..8fe4991ee 100644
--- a/depthai_sdk/src/depthai_sdk/visualize/objects.py
+++ b/depthai_sdk/src/depthai_sdk/visualize/objects.py
@@ -296,6 +296,9 @@ def __init__(self,
                  detections: List[Union[ImgDetection, dai.Tracklet]],
                  normalizer: BoundingBox,
                  label_map: List[Tuple[str, Tuple]] = None,
+                 label_color: Tuple[int, int, int] = None,
+                 label_background_color: Tuple[int, int, int] = None,
+                 label_background_transparency: float = None,
                  spatial_points: List[dai.Point3f] = None,
                  is_spatial=False,
                  parent_bbox: Union[np.ndarray, Tuple[int, int, int, int]] = None):
@@ -312,6 +315,9 @@ def __init__(self,
         self.detections = detections
         self.normalizer = normalizer
         self.label_map = label_map
+        self.label_color = label_color
+        self.label_background_color = label_background_color
+        self.label_background_transparency = label_background_transparency
         self.spatial_points = spatial_points
         self.is_spatial = is_spatial
         self.parent_bbox = parent_bbox
@@ -331,7 +337,10 @@ def serialize(self) -> dict:
             'detections': [{
                 'bbox': bbox.to_tuple(frame_shape=self.frame_shape) if isinstance(bbox, BoundingBox) else bbox,
                 'label': label,
-                'color': color
+                'color': color,
+                'label_color': self.label_color,
+                'label_background_color': self.label_background_color,
+                'label_background_transparency': self.label_background_transparency
             } for bbox, label, color in list(self.get_detections())]
         }
         if len(self._children) > 0:
@@ -385,12 +394,17 @@ def prepare(self) -> 'VisDetections':
 
                 # Add spatial coordinates
                 self.add_child(VisText(f'{spatial_coords.x}\n{spatial_coords.y}\n{spatial_coords.z}',
+                                       color=self.label_color or color,
                                        bbox=normalized_bbox,
-                                       position=TextPosition.BOTTOM_RIGHT))
+                                       position=TextPosition.BOTTOM_RIGHT,
+                                       background_color=self.label_background_color,
+                                       background_transparency=self.label_background_transparency))
 
             if cv2 and not detection_config.hide_label and len(label) > 0:
                 # Place label in the bounding box
-                self.add_child(VisText(text=label.capitalize(), bbox=normalized_bbox,
+                self.add_child(VisText(text=label.capitalize(),
+                                       color=self.label_color or color,
+                                       bbox=normalized_bbox,
                                        position=detection_config.label_position,
                                        padding=detection_config.label_padding))
 
@@ -438,6 +452,8 @@ def __init__(self,
                  color: Tuple[int, int, int] = None,
                  thickness: int = None,
                  outline: bool = True,
+                 background_color: Tuple[int, int, int] = None,
+                 background_transparency: float = 0.5,
                  bbox: Union[np.ndarray, Tuple[int, int, int, int], BoundingBox] = None,
                  position: TextPosition = TextPosition.TOP_LEFT,
                  padding: int = 10):
@@ -456,6 +472,8 @@ def __init__(self,
             color: Text color.
             thickness: Font thickness.
             outline: Enable outline if set to True, disable otherwise.
+            background_color: Background color.
+            background_transparency: Background transparency.
             bbox: Bounding box where to place text.
             position: Position w.r.t. to frame (or bbox if is set).
             padding: Padding.
@@ -467,6 +485,8 @@ def __init__(self,
         self.color = color
         self.thickness = thickness
         self.outline = outline
+        self.background_color = background_color
+        self.background_transparency = background_transparency
         self.bbox = bbox
         self.position = position
         self.padding = padding
@@ -479,6 +499,8 @@ def serialize(self):
             'color': self.color,
             'thickness': self.thickness,
             'outline': self.outline,
+            'background_color': self.background_color,
+            'background_transparency': self.background_transparency
         }
 
     def prepare(self) -> 'VisText':
@@ -505,7 +527,6 @@ def draw(self, frame: np.ndarray) -> None:
 
         # Extract shape of the bbox if exists
         if self.bbox is not None:
-            # shape = self.bbox[2] - self.bbox[0], self.bbox[3] - self.bbox[1]
             tl, br = self.bbox.denormalize(frame.shape)
             shape = br[0] - tl[0], br[1] - tl[1]
         else:
@@ -519,11 +540,28 @@ def draw(self, frame: np.ndarray) -> None:
         font_thickness = max(1, int(font_scale * 2)) \
             if text_config.auto_scale else self.thickness or text_config.font_thickness
 
-        dy = cv2.getTextSize(self.text, text_config.font_face, font_scale, font_thickness)[0][1] + 10
+        dx, dy = cv2.getTextSize(self.text, text_config.font_face, font_scale, font_thickness)[0]
+        dy += 10
 
         for line in self.text.splitlines():
             y = self.coords[1]
 
+            background_color = self.background_color or text_config.background_color
+            background_transparency = self.background_transparency or text_config.background_transparency
+            if background_color is not None:
+                img_with_background = cv2.rectangle(img=frame.copy(),
+                                                    pt1=(self.coords[0], y - dy),
+                                                    pt2=(self.coords[0] + dx, y + 10),
+                                                    color=background_color,
+                                                    thickness=-1)
+                # take transparency into account
+                cv2.addWeighted(src1=img_with_background,
+                                alpha=background_transparency,
+                                src2=frame,
+                                beta=1 - background_transparency,
+                                gamma=0,
+                                dst=frame)
+
             if self.outline:
                 # Background
                 cv2.putText(img=frame,
@@ -531,7 +569,7 @@ def draw(self, frame: np.ndarray) -> None:
                             org=self.coords,
                             fontFace=text_config.font_face,
                             fontScale=font_scale,
-                            color=text_config.bg_color,
+                            color=text_config.outline_color,
                             thickness=font_thickness + 1,
                             lineType=text_config.line_type)
 
diff --git a/depthai_sdk/src/depthai_sdk/visualize/visualizer.py b/depthai_sdk/src/depthai_sdk/visualize/visualizer.py
index b4c257f37..67d214f69 100644
--- a/depthai_sdk/src/depthai_sdk/visualize/visualizer.py
+++ b/depthai_sdk/src/depthai_sdk/visualize/visualizer.py
@@ -71,6 +71,7 @@ def add_bbox(self,
         Args:
             bbox: Bounding box.
             label: Label for the detection.
+            thickness: Bounding box thickness.
             color: Bounding box color (RGB).
             bbox_style: Bounding box style (one of depthai_sdk.visualize.configs.BboxStyle).
 
@@ -90,6 +91,9 @@ def add_detections(self,
                        normalizer: BoundingBox = None,
                        label_map: List[Tuple[str, Tuple]] = None,
                        spatial_points: List[dai.Point3f] = None,
+                       label_color: Tuple[int, int, int] = None,
+                       label_background_color: Tuple[int, int, int] = None,
+                       label_background_transparency: float = None,
                        is_spatial=False,
                        bbox: Union[np.ndarray, Tuple[int, int, int, int]] = None,
                        ) -> 'Visualizer':
@@ -101,8 +105,12 @@ def add_detections(self,
             normalizer: Normalizer object.
             label_map: List of tuples (label, color).
             spatial_points: List of spatial points. None if not spatial.
+            label_color: Color for the label.
+            label_background_color: Color for the label background.
+            label_background_transparency: Transparency for the label background.
             is_spatial: Flag that indicates if the detections are spatial.
             bbox: Bounding box, if there's a detection inside a bounding box.
+
         Returns:
             self
         """
@@ -110,6 +118,9 @@ def add_detections(self,
                                           normalizer=normalizer,
                                           label_map=label_map,
                                           spatial_points=spatial_points,
+                                          label_color=label_color,
+                                          label_background_color=label_background_color,
+                                          label_background_transparency=label_background_transparency,
                                           is_spatial=is_spatial,
                                           parent_bbox=bbox)
         self.add_object(detection_overlay)
@@ -122,6 +133,8 @@ def add_text(self,
                  color: Tuple[int, int, int] = None,
                  thickness: int = None,
                  outline: bool = True,
+                 background_color: Tuple[int, int, int] = None,
+                 background_transparency: float = 0.5,
                  bbox: Union[np.ndarray, Tuple[int, ...], BoundingBox] = None,
                  position: TextPosition = TextPosition.TOP_LEFT,
                  padding: int = 10) -> 'Visualizer':
@@ -135,6 +148,8 @@ def add_text(self,
             color: Color of the text.
             thickness: Thickness of the text.
             outline: Flag that indicates if the text should be outlined.
+            background_color: Background color.
+            background_transparency: Background transparency.
             bbox: Bounding box.
             position: Position.
             padding: Padding.
@@ -148,6 +163,8 @@ def add_text(self,
                                color=color,
                                thickness=thickness,
                                outline=outline,
+                               background_color=background_color,
+                               background_transparency=background_transparency,
                                bbox=bbox,
                                position=position,
                                padding=padding)
@@ -368,8 +385,9 @@ def text(self,
              font_scale: float = None,
              font_thickness: int = None,
              font_position: TextPosition = None,
-             bg_transparency: float = None,
-             bg_color: Tuple[int, int, int] = None,
+             background_transparency: float = None,
+             background_color: Tuple[int, int, int] = None,
+             outline_color: Tuple[int, int, int] = None,
              line_type: int = None,
              auto_scale: bool = None) -> 'Visualizer':
         """
@@ -382,8 +400,9 @@ def text(self,
             font_scale: Font scale.
             font_thickness: Font thickness.
             font_position: Font position.
-            bg_transparency: Text background transparency.
-            bg_color: Text background color.
+            background_transparency: Text background transparency.
+            background_color: Text background color.
+            outline_color: Outline color.
             line_type: Line type (from cv2).
             auto_scale: Flag that indicates if the font scale should be automatically adjusted.