diff --git a/Cargo.toml b/Cargo.toml
index dc4c097..9312c0e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,7 +23,7 @@ kamadak-exif = { version = "0.5", default-features = false }
 ash = { version = "0.37", default-features = false, features = ["loaded"] }
 
 [target.'cfg(target_os = "macos")'.dependencies]
-metal = { version = "0.27", default-features = false }
+metal = { version = "0.27", default-features = false, features = ["link"] }
 
 [profile.release]
 strip = true
diff --git a/src/correlation/gpu/correlation.wgsl b/src/correlation/gpu/correlation.wgsl
deleted file mode 100644
index e6ffc61..0000000
--- a/src/correlation/gpu/correlation.wgsl
+++ /dev/null
@@ -1,405 +0,0 @@
-struct Parameters
-{
-    img1_width: u32,
-    img1_height: u32,
-    img2_width: u32,
-    img2_height: u32,
-    out_width: u32,
-    out_height: u32,
-    scale: f32,
-    iteration_pass: u32,
-    fundamental_matrix: mat3x3<f32>,
-    corridor_offset: i32,
-    corridor_start: u32,
-    corridor_end: u32,
-    kernel_size: u32,
-    threshold: f32,
-    min_stdev: f32,
-    neighbor_distance: u32,
-    extend_range: f32,
-    min_range: f32,
-};
-
-var<push_constant> parameters: Parameters;
-// Array of image data: [img1, img2]
-@group(0) @binding(0) var<storage> images: array<f32>;
-// For searchdata: contains [min_corridor, stdev] for image1
-// For cross_correlate: contains [avg, stdev] for image1
-@group(0) @binding(1) var<storage, read_write> internals_img1: array<vec2<f32>>;
-// Contains [avg, stdev] for image 2
-@group(0) @binding(2) var<storage, read_write> internals_img2: array<vec2<f32>>;
-// Contains [min, max, neighbor_count] for the corridor range
-@group(0) @binding(3) var<storage, read_write> internals_int: array<vec3<i32>>;
-@group(0) @binding(4) var<storage, read_write> result_matches: array<vec2<i32>>;
-@group(0) @binding(5) var<storage, read_write> result_corr: array<f32>;
-
-@compute @workgroup_size(16, 16, 1)
-fn init_out_data(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x = global_id.x;
-    let y = global_id.y;
-
-    let out_width = parameters.out_width;
-    let out_height = parameters.out_height;
-
-    if x < out_width && y < out_height {
-        result_matches[out_width*y+x] = vec2(-1, -1);
-        result_corr[out_width*y+x] = -1.0;
-    }
-}
-
-@compute @workgroup_size(16, 16, 1)
-fn prepare_initialdata_searchdata(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x = global_id.x;
-    let y = global_id.y;
-
-    let out_width = parameters.out_width;
-    let out_height = parameters.out_height;
-    let img1_width = parameters.img1_width;
-    let img1_height = parameters.img1_height;
-
-    if x < img1_width && y < img1_height {
-        internals_img1[img1_width*y+x] = vec2(0.0, 0.0);
-        internals_int[img1_width*y+x] = vec3(-1, -1, 0);
-    }
-    if x < out_width && y < out_height {
-        result_corr[out_width*y+x] = -1.0;
-    }
-}
-
-@compute @workgroup_size(16, 16, 1)
-fn prepare_initialdata_correlation(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x = global_id.x;
-    let y = global_id.y;
-
-    let img1_width = parameters.img1_width;
-    let img1_height = parameters.img1_height;
-    let img2_width = parameters.img2_width;
-    let img2_height = parameters.img2_height;
-    let kernel_size = parameters.kernel_size;
-    let kernel_width = kernel_size*2u+1u;
-    let kernel_point_count = f32(kernel_width*kernel_width);
-
-    let img1_offset = 0u;
-    let img2_offset = img1_width*img1_height;
-
-    if x >= kernel_size && x < img1_width-kernel_size && y >= kernel_size && y < img1_height-kernel_size {
-        var avg = 0.0;
-        var stdev = 0.0;
-
-        for (var j=0u;j<kernel_width;j++) {
-            for (var i=0u;i<kernel_width;i++) {
-                let value = images[img1_offset + (y+j-kernel_size)*img1_width+(x+i-kernel_size)];
-                avg += value;
-            }
-        }
-        avg /= kernel_point_count;
-
-        for (var j=0u;j<kernel_width;j++) {
-            for (var i=0u;i<kernel_width;i++) {
-                let value = images[img1_offset + (y+j-kernel_size)*img1_width+(x+i-kernel_size)];
-                let delta = value-avg;
-                stdev += delta*delta;
-            }
-        }
-        stdev = sqrt(stdev/kernel_point_count);
-        internals_img1[img1_width*y+x] = vec2(avg, stdev);
-    } else if x < img1_width && y < img1_height {
-        internals_img1[img1_width*y+x] = vec2(0.0, -1.0);
-    }
-
-    if x >= kernel_size && x < img2_width-kernel_size && y >= kernel_size && y < img2_height-kernel_size {
-        var avg = 0.0;
-        var stdev = 0.0;
-
-        for (var j=0u;j<kernel_width;j++) {
-            for (var i=0u;i<kernel_width;i++) {
-                let value = images[img2_offset + (y+j-kernel_size)*img2_width+(x+i-kernel_size)];
-                avg += value;
-            }
-        }
-        avg /= kernel_point_count;
-
-        for (var j=0u;j<kernel_width;j++) {
-            for (var i=0u;i<kernel_width;i++) {
-                let value = images[img2_offset + (y+j-kernel_size)*img2_width+(x+i-kernel_size)];
-                let delta = value-avg;
-                stdev += delta*delta;
-            }
-        }
-        stdev = sqrt(stdev/kernel_point_count);
-        internals_img2[img2_width*y+x] = vec2(avg, stdev);
-    } else if x < img2_width && y < img2_height {
-        internals_img2[img2_width*y+x] = vec2(0.0, -1.0);
-    }
-}
-
-fn calculate_epipolar_line(x1: u32, y1: u32, coeff: ptr<function,vec2<f32>>, add_const: ptr<function,vec2<f32>>) {
-    let scale = parameters.scale;
-    let p1 = vec3(f32(x1)/scale, f32(y1)/scale, 1.0);
-    let f_p1 = parameters.fundamental_matrix*p1;
-    if abs(f_p1.x)>abs(f_p1.y) {
-        (*coeff).x = f32(-f_p1.y/f_p1.x);
-        (*add_const).x = f32(-scale*f_p1.z/f_p1.x);
-        (*coeff).y = 1.0;
-        (*add_const).y = 0.0;
-    } else {
-        (*coeff).x = 1.0;
-        (*add_const).x = 0.0;
-        (*coeff).y = f32(-f_p1.x/f_p1.y);
-        (*add_const).y = f32(-scale*f_p1.z/f_p1.y);
-    }
-}
-
-@compute @workgroup_size(16, 16, 1)
-fn prepare_searchdata(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x = global_id.x;
-    let y = global_id.y;
-    let x_signed = i32(global_id.x);
-    let y_signed = i32(global_id.y);
-
-    let img1_width = parameters.img1_width;
-    let img1_height = parameters.img1_height;
-    let img2_width = parameters.img2_width;
-    let img2_height = parameters.img2_height;
-    let out_width = parameters.out_width;
-    let out_height = parameters.out_height;
-    let scale = parameters.scale;
-    let first_pass = parameters.iteration_pass == 0u;
-    let corridor_start = i32(parameters.corridor_start);
-    let corridor_end = i32(parameters.corridor_end);
-    let kernel_size = parameters.kernel_size;
-    let neighbor_distance = i32(parameters.neighbor_distance);
-    let extend_range = parameters.extend_range;
-    let min_range = parameters.min_range;
-
-    let out_neighbor_width = i32(ceil(f32(parameters.neighbor_distance)/scale))*2+1;
-
-    var start_pos_x = i32(floor(f32(x_signed-neighbor_distance)/scale));
-    var start_pos_y = i32(floor(f32(y_signed-neighbor_distance)/scale));
-
-    var data_int = internals_int[y*img1_width+x];
-    var neighbor_count = data_int[2];
-    if !first_pass && neighbor_count<=0 {
-        return;
-    }
-
-    var data = internals_img1[y*img1_width+x];
-    var mid_corridor = data[0];
-    var range_stdev = data[1];
-    if !first_pass {
-        mid_corridor /= f32(neighbor_count);
-    }
-
-    var coeff: vec2<f32>;
-    var add_const: vec2<f32>;
-    calculate_epipolar_line(x, y, &coeff, &add_const);
-    let corridor_vertical = abs(coeff.y) > abs(coeff.x);
-    for (var i: i32=corridor_start;i<corridor_end;i++) {
-        let x_out = start_pos_x + (i%out_neighbor_width);
-        let y_out = start_pos_y + (i/out_neighbor_width);
-
-        if x_out <=0 || y_out<=0 || x_out>=i32(out_width) || y_out>=i32(out_height) {
-            continue;
-        }
-
-        let coord2 = vec2<f32>(result_matches[u32(y_out)*out_width + u32(x_out)]) * scale;
-        if coord2.x<0.0 || coord2.y<0.0 {
-            continue;
-        }
-
-        var corridor_pos: f32;
-        if corridor_vertical {
-            corridor_pos = round((coord2.y-add_const.y)/coeff.y);
-        } else {
-            corridor_pos = round((coord2.x-add_const.x)/coeff.x);
-        }
-        if first_pass {
-            mid_corridor += corridor_pos;
-            neighbor_count++;
-        } else {
-            let delta = corridor_pos-mid_corridor;
-            range_stdev += delta*delta;
-        }
-    }
-
-    if first_pass {
-        data[0] = mid_corridor;
-        internals_img1[y*img1_width+x] = data;
-        data_int[2] = neighbor_count;
-        internals_int[y*img1_width+x] = data_int;
-        return;
-    }
-
-    data[1] = range_stdev;
-    internals_img1[y*img1_width+x] = data;
-    range_stdev = sqrt(range_stdev/f32(neighbor_count));
-    let corridor_center = i32(round(mid_corridor));
-    let corridor_length = i32(round(min_range+range_stdev*extend_range));
-    var min_pos = i32(kernel_size);
-    var max_pos: i32;
-    if corridor_vertical {
-        max_pos = i32(img2_height-kernel_size);
-    } else {
-        max_pos = i32(img2_width-kernel_size);
-    }
-    min_pos = clamp(corridor_center - corridor_length, min_pos, max_pos);
-    max_pos = clamp(corridor_center + corridor_length, min_pos, max_pos);
-    internals_int[y*img1_width+x] = vec3(min_pos, max_pos, neighbor_count);
-}
-
-@compute @workgroup_size(16, 16, 1)
-fn cross_correlate(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x1 = global_id.x;
-    let y1 = global_id.y;
-
-    let img1_width = parameters.img1_width;
-    let img1_height = parameters.img1_height;
-    let img2_width = parameters.img2_width;
-    let img2_height = parameters.img2_height;
-    let out_width = parameters.out_width;
-    let out_height = parameters.out_height;
-    let scale = parameters.scale;
-    let first_iteration = parameters.iteration_pass == 0u;
-    let corridor_offset = parameters.corridor_offset;
-    let kernel_size = parameters.kernel_size;
-    let threshold = parameters.threshold;
-    let min_stdev = parameters.min_stdev;
-    let kernel_width = kernel_size*2u+1u;
-    let kernel_point_count = f32(kernel_width*kernel_width);
-
-    if x1 < kernel_size || x1 >= img1_width-kernel_size || y1 < kernel_size ||  y1 >= img1_height-kernel_size {
-        return;
-    }
-
-    let img1_offset = 0u;
-    let img2_offset = img1_width*img1_height;
-
-    var data_img1 = internals_img1[img1_width*y1+x1];
-    let avg1 = data_img1[0];
-    let stdev1 = data_img1[1];
-    if stdev1 < min_stdev {
-        return;
-    }
-
-    var best_corr = 0.0;
-    var best_match = vec2<i32>(-1, -1);
-
-    var corridor_start = kernel_size + parameters.corridor_start;
-    var corridor_end = kernel_size + parameters.corridor_end;
-    if !first_iteration {
-        let data_int = internals_int[img1_width*y1 + x1];
-        let min_pos_signed = data_int[0];
-        let max_pos_signed = data_int[1];
-        if min_pos_signed<0 || max_pos_signed<0 {
-            return;
-        }
-        let min_pos = u32(min_pos_signed);
-        let max_pos = u32(max_pos_signed);
-        corridor_start = clamp(min_pos+parameters.corridor_start, min_pos, max_pos);
-        corridor_end = clamp(min_pos+parameters.corridor_end, min_pos, max_pos);
-        if corridor_start >= corridor_end {
-            return;
-        }
-    }
-
-    var coeff: vec2<f32>;
-    var add_const: vec2<f32>;
-    calculate_epipolar_line(x1, y1, &coeff, &add_const);
-    var corridor_offset_vector: vec2<i32>;
-    if abs(coeff.x) > abs(coeff.y) {
-        corridor_offset_vector = vec2(0, corridor_offset);
-    } else {
-        corridor_offset_vector = vec2(corridor_offset, 0);
-    }
-    for (var corridor_pos: u32=corridor_start;corridor_pos<corridor_end;corridor_pos++) {
-        let x2_signed = i32(round(coeff.x*f32(corridor_pos)+add_const.x)) + corridor_offset_vector.x;
-        let y2_signed = i32(round(coeff.y*f32(corridor_pos)+add_const.y)) + corridor_offset_vector.y;
-        if x2_signed < 0 || y2_signed < 0 {
-            continue;
-        }
-        let x2 = u32(x2_signed);
-        let y2 = u32(y2_signed);
-        if x2 < kernel_size || x2 >= img2_width-kernel_size || y2 < kernel_size ||  y2 >= img2_height-kernel_size {
-            continue;
-        }
-
-        let data_img2 = internals_img2[img2_width*y2 + x2];
-        let avg2 = data_img2[0];
-        let stdev2 = data_img2[1];
-        if stdev2 < min_stdev {
-            continue;
-        }
-
-        var corr = 0.0;
-        for (var j=0u;j<kernel_width;j++) {
-            for (var i=0u;i<kernel_width;i++) {
-                let delta1 = images[img1_offset + (y1+j-kernel_size)*img1_width+(x1+i-kernel_size)] - avg1;
-                let delta2 = images[img2_offset + (y2+j-kernel_size)*img2_width+(x2+i-kernel_size)] - avg2;
-                corr += delta1*delta2;
-            }
-        }
-        corr = corr/(stdev1*stdev2*kernel_point_count);
-
-        if corr >= threshold && corr > best_corr {
-            best_match.x = i32(round(f32(x2)/scale));
-            best_match.y = i32(round(f32(y2)/scale));
-            best_corr = corr;
-        }
-    }
-
-    let out_pos = out_width*u32((f32(y1)/scale)) + u32(f32(x1)/scale);
-    let current_corr = result_corr[out_pos];
-    if (best_corr >= threshold && best_corr > current_corr)
-    {
-        result_matches[out_pos] = best_match;
-        result_corr[out_pos] = best_corr;
-    }
-}
-
-@group(1) @binding(0) var<storage, read_write> img1: array<vec2<i32>>;
-@group(1) @binding(1) var<storage> img2: array<vec2<i32>>;
-
-@compute @workgroup_size(16, 16, 1)
-fn cross_check_filter(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let x1 = global_id.x;
-    let y1 = global_id.y;
-
-    let img1_width = parameters.img1_width;
-    let img1_height = parameters.img1_height;
-    let img2_width = parameters.img2_width;
-    let img2_height = parameters.img2_height;
-    let search_area = i32(parameters.neighbor_distance);
-
-    if x1 >= img1_width || y1 >= img1_height {
-        return;
-    }
-
-    let point = img1[img1_width*y1+x1];
-    if point.x < 0 || point.y < 0 || u32(point.x) >= img2_width || u32(point.y) >= img2_height {
-        return;
-    }
-
-    let min_x = u32(clamp(point.x-search_area, 0, i32(img2_width)));
-    let max_x = u32(clamp(point.x+search_area+1, 0, i32(img2_width)));
-    let min_y = u32(clamp(point.y-search_area, 0, i32(img2_height)));
-    let max_y = u32(clamp(point.y+search_area+1, 0, i32(img2_height)));
-
-    let r_min_x = clamp(i32(x1)-search_area, 0, i32(img1_width));
-    let r_max_x = clamp(i32(x1)+search_area+1, 0, i32(img1_width));
-    let r_min_y = clamp(i32(y1)-search_area, 0, i32(img1_height));
-    let r_max_y = clamp(i32(y1)+search_area+1, 0, i32(img1_height));
-
-    for (var y2 = min_y;y2<max_y;y2++) {
-        for (var x2 = min_x;x2<max_x;x2++) {
-            let rpoint = img2[img2_width*y2+x2];
-            if rpoint.x < 0 || rpoint.y < 0 {
-                continue;
-            }
-            if rpoint.x >= r_min_x && rpoint.x < r_max_x && rpoint.y >= r_min_y && rpoint.y < r_max_y {
-                return;
-            }
-        }
-    }
-
-    img1[img1_width*y1+x1] = vec2(-1, -1);
-}
diff --git a/src/correlation/gpu/metal.rs b/src/correlation/gpu/metal.rs
index 1b68c11..d88bea4 100644
--- a/src/correlation/gpu/metal.rs
+++ b/src/correlation/gpu/metal.rs
@@ -1,63 +1,452 @@
+use std::{collections::HashMap, error, ffi::c_void, fs, slice};
+
+use metal::objc::rc::autoreleasepool;
 use nalgebra::Matrix3;
-use std::{error, fmt};
+use rayon::iter::ParallelIterator;
 
-use crate::data::Grid;
+use crate::{
+    correlation::{gpu::ShaderParams, Match},
+    data::{Grid, Point2D},
+};
 
-use super::{CorrelationDirection, ProjectionMode};
+use super::{CorrelationDirection, GpuError, HardwareMode};
 
-pub struct GpuContext {}
+// This is optimized to work with built-in Apple Silicon GPUs.
+// AMD or built-in Intel GPUs will likely underperform.
+// AMD64 devices are no longer available for sale, and testing/validating would require too much
+// effort.
+pub struct Device {
+    device: metal::Device,
+    buffers: Option<DeviceBuffers>,
+    direction: CorrelationDirection,
+    unified_memory: bool,
+    pipelines: HashMap<ShaderModuleType, metal::ComputePipelineState>,
+    command_queue: metal::CommandQueue,
+}
 
-impl GpuContext {
-    pub fn new(
-        _: (usize, usize),
-        _: (usize, usize),
-        _: ProjectionMode,
-        _: Matrix3<f64>,
-        _: bool,
-    ) -> Result<GpuContext, Box<dyn error::Error>> {
-        Err(GpuError::new("Compiled without GPU support").into())
+enum BufferType {
+    GpuOnly,
+    HostVisible,
+}
+
+struct DeviceBuffers {
+    buffer_img: metal::Buffer,
+    buffer_internal_img1: metal::Buffer,
+    buffer_internal_img2: metal::Buffer,
+    buffer_internal_int: metal::Buffer,
+    buffer_out: metal::Buffer,
+    buffer_out_reverse: metal::Buffer,
+    buffer_out_corr: metal::Buffer,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum ShaderModuleType {
+    InitOutData,
+    PrepareInitialdataSearchdata,
+    PrepareInitialdataCorrelation,
+    PrepareSearchdata,
+    CrossCorrelate,
+    CrossCheckFilter,
+}
+
+pub struct DeviceContext {
+    low_power: bool,
+    device: Device,
+}
+
+impl DeviceContext {
+    pub fn new(hardware_mode: HardwareMode) -> Result<DeviceContext, Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            if !matches!(hardware_mode, HardwareMode::Gpu | HardwareMode::GpuLowPower) {
+                return Err(GpuError::new("GPU mode is not enabled").into());
+            };
+            let low_power = matches!(hardware_mode, HardwareMode::GpuLowPower);
+            let device = Device::new()?;
+            Ok(DeviceContext { low_power, device })
+        })
     }
+}
 
-    pub fn get_device_name(&self) -> &'static str {
-        "undefined"
+impl super::DeviceContext<Device> for DeviceContext {
+    fn convert_fundamental_matrix(fundamental_matrix: &Matrix3<f64>) -> [f32; 3 * 4] {
+        let mut f = [0f32; 3 * 4];
+        for row in 0..3 {
+            for col in 0..3 {
+                f[row * 4 + col] = fundamental_matrix[(row, col)] as f32;
+            }
+        }
+        f
     }
 
-    pub fn cross_check_filter(&mut self, _: f32, _: CorrelationDirection) {}
+    fn is_low_power(&self) -> bool {
+        self.low_power
+    }
 
-    pub fn complete_process(
-        &mut self,
-    ) -> Result<Grid<Option<super::Match>>, Box<dyn error::Error>> {
-        Err(GpuError::new("Compiled without GPU support").into())
+    fn get_device_name(&self) -> Option<String> {
+        autoreleasepool(|| Some(self.device.device.name().into()))
     }
 
-    pub fn correlate_images<PL: super::ProgressListener>(
+    fn prepare_device(
         &mut self,
-        _: &Grid<u8>,
-        _: &Grid<u8>,
-        _: f32,
-        _: bool,
-        _: Option<&PL>,
-        _: CorrelationDirection,
+        img1_dimensions: (usize, usize),
+        img2_dimensions: (usize, usize),
     ) -> Result<(), Box<dyn error::Error>> {
-        Err(GpuError::new("Compiled without GPU support").into())
+        let img1_pixels = img1_dimensions.0 * img1_dimensions.1;
+        let img2_pixels = img2_dimensions.0 * img2_dimensions.1;
+
+        autoreleasepool(|| {
+            let device = &mut self.device;
+            device.buffers = None;
+            let buffers = unsafe { device.create_buffers(img1_pixels, img2_pixels)? };
+            device.buffers = Some(buffers);
+            device.direction = CorrelationDirection::Forward;
+            Ok(())
+        })
+    }
+
+    fn device(&self) -> Result<&Device, GpuError> {
+        Ok(&self.device)
+    }
+
+    fn device_mut(&mut self) -> Result<&mut Device, GpuError> {
+        Ok(&mut self.device)
     }
 }
 
-#[derive(Debug)]
-pub struct GpuError {
-    msg: &'static str,
+impl Device {
+    fn new() -> Result<Device, Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            let device = match metal::Device::system_default() {
+                Some(device) => device,
+                None => return Err(GpuError::new("GPU mode is not enabled").into()),
+            };
+            let direction = CorrelationDirection::Forward;
+            let unified_memory = device.has_unified_memory();
+            let pipelines = Device::create_pipelines(&device)?;
+            let command_queue = device.new_command_queue();
+            Ok(Device {
+                device,
+                buffers: None,
+                direction,
+                unified_memory,
+                pipelines,
+                command_queue,
+            })
+        })
+    }
+
+    unsafe fn create_buffers(
+        &self,
+        img1_pixels: usize,
+        img2_pixels: usize,
+    ) -> Result<DeviceBuffers, Box<dyn error::Error>> {
+        let max_pixels = img1_pixels.max(img2_pixels);
+        let buffer_img = self.create_buffer(
+            (img1_pixels + img2_pixels) * std::mem::size_of::<f32>(),
+            BufferType::HostVisible,
+        );
+
+        let buffer_internal_img1 = self.create_buffer(
+            (img1_pixels * 2) * std::mem::size_of::<f32>(),
+            BufferType::GpuOnly,
+        );
+
+        let buffer_internal_img2 = self.create_buffer(
+            (img2_pixels * 2) * std::mem::size_of::<f32>(),
+            BufferType::GpuOnly,
+        );
+
+        let buffer_internal_int = self.create_buffer(
+            max_pixels * 4 * std::mem::size_of::<i32>(),
+            BufferType::GpuOnly,
+        );
+
+        let buffer_out = self.create_buffer(
+            img1_pixels * 2 * std::mem::size_of::<i32>(),
+            BufferType::HostVisible,
+        );
+
+        let buffer_out_reverse = self.create_buffer(
+            img2_pixels * 2 * std::mem::size_of::<i32>(),
+            BufferType::GpuOnly,
+        );
+
+        let buffer_out_corr = self.create_buffer(
+            max_pixels * std::mem::size_of::<f32>(),
+            BufferType::HostVisible,
+        );
+
+        Ok(DeviceBuffers {
+            buffer_img,
+            buffer_internal_img1,
+            buffer_internal_img2,
+            buffer_internal_int,
+            buffer_out,
+            buffer_out_reverse,
+            buffer_out_corr,
+        })
+    }
+
+    fn buffers(&self) -> Result<&DeviceBuffers, GpuError> {
+        match self.buffers.as_ref() {
+            Some(buffers) => Ok(buffers),
+            None => Err(GpuError::new("Buffers not initialized")),
+        }
+    }
+
+    unsafe fn create_buffer(&self, size: usize, buffer_type: BufferType) -> metal::Buffer {
+        let size = size as u64;
+        let options = match buffer_type {
+            BufferType::GpuOnly => metal::MTLResourceOptions::StorageModePrivate,
+            BufferType::HostVisible => {
+                if self.unified_memory {
+                    metal::MTLResourceOptions::StorageModeShared
+                } else {
+                    metal::MTLResourceOptions::StorageModeManaged
+                }
+            }
+        };
+        autoreleasepool(|| self.device.new_buffer(size, options))
+    }
+
+    fn flush_buffer_to_device(&self, buffer: &metal::Buffer, size: usize) {
+        if !self.unified_memory {
+            let range = metal::NSRange {
+                location: 0,
+                length: size as u64,
+            };
+            buffer.did_modify_range(range);
+        }
+    }
+
+    fn flush_buffer_to_host(&self, buffer: &metal::Buffer) {
+        if !&self.unified_memory {
+            let command_buffer = self.command_queue.new_command_buffer();
+            let blit_encoder = command_buffer.new_blit_command_encoder();
+            blit_encoder.synchronize_resource(buffer);
+            blit_encoder.end_encoding();
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+        }
+    }
+
+    fn create_pipelines(
+        device: &metal::Device,
+    ) -> Result<HashMap<ShaderModuleType, metal::ComputePipelineState>, Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            let mut result = HashMap::new();
+            // TODO: switch to a precompiled library and use new_library_with_data
+            let source = fs::read_to_string("correlation.metal")?;
+            let library =
+                device.new_library_with_source(source.as_str(), &metal::CompileOptions::new())?;
+            for module_type in ShaderModuleType::VALUES {
+                let function = module_type.load(&library)?;
+                let pipeline = device.new_compute_pipeline_state_with_function(&function)?;
+                result.insert(module_type, pipeline);
+            }
+            Ok(result)
+        })
+    }
+
+    fn set_buffer_layout(
+        &self,
+        shader: &ShaderModuleType,
+        command_encoder: &metal::ComputeCommandEncoderRef,
+    ) -> Result<(), GpuError> {
+        let buffers = &self.buffers()?;
+        let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) =
+            match self.direction {
+                CorrelationDirection::Forward => (
+                    &buffers.buffer_internal_img1,
+                    &buffers.buffer_internal_img2,
+                    &buffers.buffer_out,
+                    &buffers.buffer_out_reverse,
+                ),
+                CorrelationDirection::Reverse => (
+                    &buffers.buffer_internal_img2,
+                    &buffers.buffer_internal_img1,
+                    &buffers.buffer_out_reverse,
+                    &buffers.buffer_out,
+                ),
+            };
+        // Index 0 is reserved for push_constants.
+        if matches!(shader, ShaderModuleType::CrossCheckFilter) {
+            command_encoder.set_buffer(1, Some(buffer_out), 0);
+            command_encoder.set_buffer(2, Some(buffer_out_reverse), 0);
+            return Ok(());
+        }
+
+        command_encoder.set_buffer(1, Some(&buffers.buffer_img), 0);
+        command_encoder.set_buffer(2, Some(buffer_internal_img1), 0);
+        command_encoder.set_buffer(3, Some(buffer_internal_img2), 0);
+        command_encoder.set_buffer(4, Some(&buffers.buffer_internal_int), 0);
+        command_encoder.set_buffer(5, Some(buffer_out), 0);
+        command_encoder.set_buffer(6, Some(&buffers.buffer_out_corr), 0);
+        Ok(())
+    }
 }
 
-impl GpuError {
-    fn new(msg: &'static str) -> GpuError {
-        GpuError { msg }
+impl super::Device for Device {
+    fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError> {
+        self.direction = direction.to_owned();
+        Ok(())
+    }
+
+    unsafe fn run_shader(
+        &mut self,
+        dimensions: (usize, usize),
+        shader_type: ShaderModuleType,
+        shader_params: ShaderParams,
+    ) -> Result<(), Box<dyn error::Error>> {
+        let workgroup_size = ((dimensions.0 + 15) / 16, ((dimensions.1 + 15) / 16));
+        autoreleasepool(|| {
+            let pipeline = self.pipelines.get(&shader_type).unwrap();
+            let command_buffer = self.command_queue.new_command_buffer();
+
+            let compute_encoder = command_buffer.new_compute_command_encoder();
+            compute_encoder.set_compute_pipeline_state(pipeline);
+
+            let push_constants_data = slice::from_raw_parts(
+                &shader_params as *const ShaderParams as *const u8,
+                std::mem::size_of::<ShaderParams>(),
+            );
+            compute_encoder.set_bytes(
+                0,
+                push_constants_data.len() as u64,
+                push_constants_data.as_ptr() as *const c_void,
+            );
+            self.set_buffer_layout(&shader_type, compute_encoder)?;
+
+            let thread_group_count = metal::MTLSize {
+                width: workgroup_size.0 as u64,
+                height: workgroup_size.1 as u64,
+                depth: 1,
+            };
+            let thread_group_size = metal::MTLSize {
+                width: 16,
+                height: 16,
+                depth: 1,
+            };
+
+            compute_encoder.dispatch_thread_groups(thread_group_count, thread_group_size);
+            compute_encoder.end_encoding();
+
+            command_buffer.commit();
+            command_buffer.wait_until_completed();
+
+            Ok(())
+        })
+    }
+
+    unsafe fn transfer_in_images(
+        &self,
+        img1: &Grid<u8>,
+        img2: &Grid<u8>,
+    ) -> Result<(), Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            let buffers = self.buffers()?;
+            let buffer = &buffers.buffer_img;
+            let buffer_contents = buffer.contents();
+            let img2_offset = img1.width() * img1.height();
+            let size = img1.width() * img1.height() + img2.width() * img2.height();
+            let img_slice = slice::from_raw_parts_mut(buffer_contents as *mut f32, size);
+            img1.iter()
+                .for_each(|(x, y, val)| img_slice[y * img1.width() + x] = *val as f32);
+            img2.iter().for_each(|(x, y, val)| {
+                img_slice[img2_offset + y * img2.width() + x] = *val as f32
+            });
+
+            self.flush_buffer_to_device(buffer, std::mem::size_of_val(img_slice));
+            Ok(())
+        })
+    }
+
+    unsafe fn save_corr(
+        &self,
+        correlation_values: &mut Grid<Option<f32>>,
+        correlation_threshold: f32,
+    ) -> Result<(), Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            let buffers = self.buffers()?;
+            let buffer = &buffers.buffer_out_corr;
+            self.flush_buffer_to_host(buffer);
+            let buffer_contents = buffer.contents();
+            let size = correlation_values.width() * correlation_values.height();
+            let width = correlation_values.width();
+            let out_corr = slice::from_raw_parts_mut(buffer_contents as *mut f32, size);
+            correlation_values
+                .par_iter_mut()
+                .for_each(|(x, y, out_point)| {
+                    let corr = out_corr[y * width + x];
+                    if corr > correlation_threshold {
+                        *out_point = Some(corr);
+                    }
+                });
+
+            Ok(())
+        })
+    }
+
+    unsafe fn save_result(
+        &self,
+        out_image: &mut Grid<Option<Match>>,
+        correlation_values: &Grid<Option<f32>>,
+    ) -> Result<(), Box<dyn error::Error>> {
+        autoreleasepool(|| {
+            let buffers = self.buffers()?;
+            let buffer = &buffers.buffer_out;
+            self.flush_buffer_to_host(buffer);
+            let buffer_contents = buffer.contents();
+            let size = out_image.width() * out_image.height() * 2;
+            let width = out_image.width();
+            let out_data = slice::from_raw_parts_mut(buffer_contents as *mut i32, size);
+            out_image.par_iter_mut().for_each(|(x, y, out_point)| {
+                let pos = 2 * (y * width + x);
+                let (match_x, match_y) = (out_data[pos], out_data[pos + 1]);
+                if let Some(corr) = correlation_values.val(x, y) {
+                    *out_point = if match_x > 0 && match_y > 0 {
+                        let point_match = Point2D::new(match_x as u32, match_y as u32);
+                        Some((point_match, *corr))
+                    } else {
+                        None
+                    };
+                } else {
+                    *out_point = None;
+                };
+            });
+
+            Ok(())
+        })
+    }
+
+    unsafe fn destroy_buffers(&mut self) {
+        autoreleasepool(|| {
+            self.buffers = None;
+        })
     }
 }
 
-impl std::error::Error for GpuError {}
+impl ShaderModuleType {
+    const VALUES: [Self; 6] = [
+        Self::InitOutData,
+        Self::PrepareInitialdataSearchdata,
+        Self::PrepareInitialdataCorrelation,
+        Self::PrepareSearchdata,
+        Self::CrossCorrelate,
+        Self::CrossCheckFilter,
+    ];
+
+    fn load(&self, library: &metal::Library) -> Result<metal::Function, Box<dyn error::Error>> {
+        let function_name = match self {
+            ShaderModuleType::InitOutData => "init_out_data",
+            ShaderModuleType::PrepareInitialdataSearchdata => "prepare_initialdata_searchdata",
+            ShaderModuleType::PrepareInitialdataCorrelation => "prepare_initialdata_correlation",
+            ShaderModuleType::PrepareSearchdata => "prepare_searchdata",
+            ShaderModuleType::CrossCorrelate => "cross_correlate",
+            ShaderModuleType::CrossCheckFilter => "cross_check_filter",
+        };
+        let function = library.get_function(function_name, None)?;
 
-impl fmt::Display for GpuError {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "{}", self.msg)
+        Ok(function)
     }
 }
diff --git a/src/correlation/gpu/mod.rs b/src/correlation/gpu/mod.rs
index ca47f18..c9bae20 100644
--- a/src/correlation/gpu/mod.rs
+++ b/src/correlation/gpu/mod.rs
@@ -3,15 +3,19 @@ mod metal;
 #[cfg(not(target_os = "macos"))]
 mod vulkan;
 
-use std::{error, fmt};
+#[cfg(target_os = "macos")]
+use metal::ShaderModuleType;
 #[cfg(not(target_os = "macos"))]
 use vulkan::ShaderModuleType;
 
+#[cfg(target_os = "macos")]
+pub type DefaultDeviceContext = metal::DeviceContext;
 #[cfg(not(target_os = "macos"))]
 pub type DefaultDeviceContext = vulkan::DeviceContext;
 
 use crate::data::Grid;
 use nalgebra::Matrix3;
+use std::{error, fmt};
 
 use crate::correlation::{
     CorrelationDirection, CorrelationParameters, HardwareMode, ProjectionMode, CORRIDOR_MIN_RANGE,
@@ -27,6 +31,8 @@ const CORRIDOR_SEGMENT_LENGTH_LOWPOWER: usize = 8;
 const SEARCH_AREA_SEGMENT_LENGTH_LOWPOWER: usize = 128;
 
 trait Device {
+    fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError>;
+
     unsafe fn run_shader(
         &mut self,
         dimensions: (usize, usize),
@@ -59,6 +65,8 @@ trait DeviceContext<D>
 where
     D: Device,
 {
+    fn convert_fundamental_matrix(fundamental_matrix: &Matrix3<f64>) -> [f32; 3 * 4];
+
     fn is_low_power(&self) -> bool;
 
     fn get_device_name(&self) -> Option<String>;
@@ -224,7 +232,7 @@ impl GpuContext<'_> {
         dir: CorrelationDirection,
     ) -> Result<(), Box<dyn error::Error>> {
         {
-            let device = self.device_context.device()?;
+            let device = self.device_context.device_mut()?;
             device.set_buffer_direction(&dir)?;
         }
         let max_width = img1.width().max(img2.width());
@@ -385,17 +393,7 @@ impl GpuContext<'_> {
             CorrelationDirection::Forward => self.fundamental_matrix,
             CorrelationDirection::Reverse => self.fundamental_matrix.transpose(),
         };
-        let mut f = [0f32; 3 * 4];
-        // Matrix layout in GLSL (OpenGL) is pure madness: https://www.opengl.org/archives/resources/faq/technical/transformations.htm.
-        // "Column major" means that vectors are vertical and a matrix multiplies a vector.
-        // "Row major" means a horizontal vector multiplies a matrix.
-        // This says nothing about how the matrix is stored in memory.
-        for row in 0..3 {
-            for col in 0..3 {
-                f[col * 4 + row] = fundamental_matrix[(row, col)] as f32;
-            }
-        }
-        f
+        DefaultDeviceContext::convert_fundamental_matrix(&fundamental_matrix)
     }
 }
 
diff --git a/src/correlation/gpu/shaders/correlation.metal b/src/correlation/gpu/shaders/correlation.metal
index 5f61074..99c0fb9 100644
--- a/src/correlation/gpu/shaders/correlation.metal
+++ b/src/correlation/gpu/shaders/correlation.metal
@@ -1,3 +1,392 @@
-kernel void add_arrays(uint index [[thread_position_in_grid]])
+#include <metal_stdlib>
+
+using namespace metal;
+
+struct Parameters
 {
+    uint img1_width;
+    uint img1_height;
+    uint img2_width;
+    uint img2_height;
+    uint out_width;
+    uint out_height;
+    float scale;
+    uint iteration_pass;
+    float3x3 fundamental_matrix;
+    int corridor_offset;
+    uint corridor_start;
+    uint corridor_end;
+    uint kernel_size;
+    float threshold;
+    float min_stdev;
+    uint neighbor_distance;
+    float extend_range;
+    float min_range;
+};
+
+kernel void init_out_data(const device Parameters& p [[buffer(0)]], device int2* result_matches[[buffer(5)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x = index.x;
+    const uint y = index.y;
+
+    const uint out_width = p.out_width;
+    const uint out_height = p.out_height;
+
+    if (x < out_width && y < out_height) {
+        result_matches[out_width*y+x] = int2(-1, -1);
+        result_corr[out_width*y+x] = -1.0;
+    }
+}
+
+kernel void prepare_initialdata_searchdata(const device Parameters& p [[buffer(0)]], device float2* internals_img1[[buffer(2)]], device int3* internals_int[[buffer(4)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x = index.x;
+    const uint y = index.y;
+
+    const uint out_width = p.out_width;
+    const uint out_height = p.out_height;
+    const uint img1_width = p.img1_width;
+    const uint img1_height = p.img1_height;
+
+    if (x < img1_width && y < img1_height) {
+        internals_img1[img1_width*y+x] = float2(0.0, 0.0);
+        internals_int[img1_width*y+x] = int3(-1, -1, 0);
+    }
+    if (x < out_width && y < out_height) {
+        result_corr[out_width*y+x] = -1.0;
+    }
+}
+
+kernel void prepare_initialdata_correlation(const device Parameters& p [[buffer(0)]], const device float* images[[buffer(1)]], device float2* internals_img1[[buffer(2)]], device float2* internals_img2[[buffer(3)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x = index.x;
+    const uint y = index.y;
+
+    const uint img1_width = p.img1_width;
+    const uint img1_height = p.img1_height;
+    const uint img2_width = p.img2_width;
+    const uint img2_height = p.img2_height;
+    const uint kernel_size = p.kernel_size;
+    const uint kernel_width = kernel_size*2+1;
+    const float kernel_point_count = float(kernel_width*kernel_width);
+
+    const uint img1_offset = 0;
+    const uint img2_offset = img1_width*img1_height;
+
+    if (x >= kernel_size && x < img1_width-kernel_size && y >= kernel_size && y < img1_height-kernel_size) {
+        float avg = 0.0;
+        float stdev = 0.0;
+
+        for (uint j=0;j<kernel_width;j++) {
+            for (uint i=0;i<kernel_width;i++) {
+                float value = images[img1_offset + (y+j-kernel_size)*img1_width+(x+i-kernel_size)];
+                avg += value;
+            }
+        }
+        avg /= kernel_point_count;
+
+        for (uint j=0;j<kernel_width;j++) {
+            for (uint i=0;i<kernel_width;i++) {
+                float value = images[img1_offset + (y+j-kernel_size)*img1_width+(x+i-kernel_size)];
+                float delta = value-avg;
+                stdev += delta*delta;
+            }
+        }
+        stdev = sqrt(stdev/kernel_point_count);
+        internals_img1[img1_width*y+x] = float2(avg, stdev);
+    } else if (x < img1_width && y < img1_height) {
+        internals_img1[img1_width*y+x] = float2(0.0, -1.0);
+    }
+
+    if (x >= kernel_size && x < img2_width-kernel_size && y >= kernel_size && y < img2_height-kernel_size) {
+        float avg = 0.0;
+        float stdev = 0.0;
+
+        for (uint j=0;j<kernel_width;j++) {
+            for (uint i=0;i<kernel_width;i++) {
+                float value = images[img2_offset + (y+j-kernel_size)*img2_width+(x+i-kernel_size)];
+                avg += value;
+            }
+        }
+        avg /= kernel_point_count;
+
+        for (uint j=0;j<kernel_width;j++) {
+            for (uint i=0;i<kernel_width;i++) {
+                float value = images[img2_offset + (y+j-kernel_size)*img2_width+(x+i-kernel_size)];
+                float delta = value-avg;
+                stdev += delta*delta;
+            }
+        }
+        stdev = sqrt(stdev/kernel_point_count);
+        internals_img2[img2_width*y+x] = float2(avg, stdev);
+    } else if (x < img2_width && y < img2_height) {
+        internals_img2[img2_width*y+x] = float2(0.0, -1.0);
+    }
+}
+
+void calculate_epipolar_line(const device Parameters& p, const uint x1, const uint y1, thread float2& coeff, thread float2& add_const) {
+    const float scale = p.scale;
+    const float3 p1 = float3(float(x1)/scale, float(y1)/scale, 1.0);
+    const float3 f_p1 = p.fundamental_matrix*p1;
+    if (abs(f_p1.x)>abs(f_p1.y)) {
+        coeff = float2(-f_p1.y/f_p1.x, 1.0);
+        add_const = float2(-scale*f_p1.z/f_p1.x, 0.0);
+    } else {
+        coeff = float2(1.0, -f_p1.x/f_p1.y);
+        add_const = float2(0.0, -scale*f_p1.z/f_p1.y);
+    }
+}
+
+kernel void prepare_searchdata(const device Parameters& p [[buffer(0)]], device float2* internals_img1[[buffer(2)]], device int3* internals_int[[buffer(4)]], const device int2* result_matches[[buffer(5)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x = index.x;
+    const uint y = index.y;
+
+    const uint img1_width = p.img1_width;
+    const uint img1_height = p.img1_height;
+
+    if (x >= img1_width || y >= img1_height) {
+        return;
+    }
+
+    const uint img2_width = p.img2_width;
+    const uint img2_height = p.img2_height;
+    const uint out_width = p.out_width;
+    const uint out_height = p.out_height;
+    const float scale = p.scale;
+    const uint kernel_size = p.kernel_size;
+    const uint kernel_width = kernel_size*2+1;
+
+    const int x_signed = int(x);
+    const int y_signed = int(y);
+
+    const bool first_pass = p.iteration_pass == 0;
+    const int neighbor_distance_signed = int(p.neighbor_distance);
+    const float extend_range = p.extend_range;
+    const float min_range = p.min_range;
+
+    const int corridor_start_signed = int(p.corridor_start);
+    const int corridor_end_signed = int(p.corridor_end);
+    const int out_neighbor_width = int(ceil(float(p.neighbor_distance)/scale))*2+1;
+
+    const int start_pos_x = int(floor(float(x_signed-neighbor_distance_signed)/scale));
+    const int start_pos_y = int(floor(float(y_signed-neighbor_distance_signed)/scale));
+
+    int3 data_int = internals_int[y*img1_width+x];
+    int neighbor_count = data_int[2];
+    if (!first_pass && neighbor_count<=0) {
+        return;
+    }
+
+    float2 data = internals_img1[y*img1_width+x];
+    float mid_corridor = data[0];
+    float range_stdev = data[1];
+    if (!first_pass) {
+        mid_corridor /= float(neighbor_count);
+    }
+
+    float2 coeff;
+    float2 add_const;
+    calculate_epipolar_line(p, x, y, coeff, add_const);
+    const bool corridor_vertical = abs(coeff.y) > abs(coeff.x);
+    for (int i=corridor_start_signed;i<corridor_end_signed;i++) {
+        int x_out = start_pos_x + (i%out_neighbor_width);
+        int y_out = start_pos_y + (i/out_neighbor_width);
+
+        if (x_out <=0 || y_out<=0 || x_out>=int(out_width) || y_out>=int(out_height)) {
+            continue;
+        }
+
+        float2 coord2 = float2(result_matches[uint(y_out)*out_width + uint(x_out)]) * scale;
+        if (coord2.x<0.0 || coord2.y<0.0) {
+            continue;
+        }
+
+        float corridor_pos;
+        if (corridor_vertical) {
+            corridor_pos = round((coord2.y-add_const.y)/coeff.y);
+        } else {
+            corridor_pos = round((coord2.x-add_const.x)/coeff.x);
+        }
+        if (first_pass) {
+            mid_corridor += corridor_pos;
+            neighbor_count++;
+        } else {
+            float delta = corridor_pos-mid_corridor;
+            range_stdev += delta*delta;
+        }
+    }
+
+    if (first_pass) {
+        data[0] = mid_corridor;
+        internals_img1[y*img1_width+x] = data;
+        data_int[2] = neighbor_count;
+        internals_int[y*img1_width+x] = data_int;
+        return;
+    }
+
+    data[1] = range_stdev;
+    internals_img1[y*img1_width+x] = data;
+    range_stdev = sqrt(range_stdev/float(neighbor_count));
+    const int corridor_center = int(round(mid_corridor));
+    const int corridor_length = int(round(min_range+range_stdev*extend_range));
+    int min_pos = int(kernel_size);
+    int max_pos;
+    if (corridor_vertical) {
+        max_pos = int(img2_height-kernel_size);
+    } else {
+        max_pos = int(img2_width-kernel_size);
+    }
+    min_pos = clamp(corridor_center - corridor_length, min_pos, max_pos);
+    max_pos = clamp(corridor_center + corridor_length, min_pos, max_pos);
+    internals_int[y*img1_width+x] = int3(min_pos, max_pos, neighbor_count);
+}
+
+kernel void cross_correlate(const device Parameters& p [[buffer(0)]], const device float* images[[buffer(1)]], const device float2* internals_img1[[buffer(2)]], const device float2* internals_img2[[buffer(3)]], const device int3* internals_int[[buffer(4)]], device int2* result_matches[[buffer(5)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x1 = index.x;
+    const uint y1 = index.y;
+
+    const uint img1_width = p.img1_width;
+    const uint img1_height = p.img1_height;
+    const uint kernel_size = p.kernel_size;
+
+    if (x1 < kernel_size || x1 >= img1_width-kernel_size || y1 < kernel_size ||  y1 >= img1_height-kernel_size) {
+        return;
+    }
+
+    const uint img2_width = p.img2_width;
+    const uint img2_height = p.img2_height;
+    const uint out_width = p.out_width;
+    const float scale = p.scale;
+    const bool first_iteration = p.iteration_pass == 0;
+    const int corridor_offset = p.corridor_offset;
+    const float threshold = p.threshold;
+    const float min_stdev = p.min_stdev;
+    const uint kernel_width = kernel_size*2+1;
+    const float kernel_point_count = float(kernel_width*kernel_width);
+
+    const uint img1_offset = 0;
+    const uint img2_offset = img1_width*img1_height;
+
+    const float2 data_img1 = internals_img1[img1_width*y1+x1];
+    const float avg1 = data_img1[0];
+    const float stdev1 = data_img1[1];
+    if (stdev1 < min_stdev) {
+        return;
+    }
+
+    float best_corr = 0.0;
+    int2 best_match = int2(-1, -1);
+
+    uint corridor_start = kernel_size + p.corridor_start;
+    uint corridor_end = kernel_size + p.corridor_end;
+    if (!first_iteration) {
+        const int3 data_int = internals_int[img1_width*y1 + x1];
+        int min_pos_signed = data_int[0];
+        int max_pos_signed = data_int[1];
+        if (min_pos_signed<0 || max_pos_signed<0) {
+            return;
+        }
+        const uint min_pos = uint(min_pos_signed);
+        const uint max_pos = uint(max_pos_signed);
+        corridor_start = clamp(min_pos+p.corridor_start, min_pos, max_pos);
+        corridor_end = clamp(min_pos+p.corridor_end, min_pos, max_pos);
+        if (corridor_start >= corridor_end) {
+            return;
+        }
+    }
+
+    float2 coeff;
+    float2 add_const;
+    calculate_epipolar_line(p, x1, y1, coeff, add_const);
+    int2 corridor_offset_vector;
+    if (abs(add_const.x) > abs(add_const.y)) {
+        corridor_offset_vector = int2(corridor_offset, 0);
+    } else {
+        corridor_offset_vector = int2(0, corridor_offset);
+    }
+    for (uint corridor_pos=corridor_start;corridor_pos<corridor_end;corridor_pos++) {
+        const int x2_signed = int(floor(coeff.x*float(corridor_pos)+add_const.x)) + corridor_offset_vector.x;
+        const int y2_signed = int(floor(coeff.y*float(corridor_pos)+add_const.y)) + corridor_offset_vector.y;
+        if (x2_signed < 0 || y2_signed < 0) {
+            continue;
+        }
+        const uint x2 = uint(x2_signed);
+        const uint y2 = uint(y2_signed);
+        if (x2 < kernel_size || x2 >= img2_width-kernel_size || y2 < kernel_size ||  y2 >= img2_height-kernel_size) {
+            continue;
+        }
+
+        const float2 data_img2 = internals_img2[img2_width*y2 + x2];
+        const float avg2 = data_img2[0];
+        const float stdev2 = data_img2[1];
+        if (stdev2 < min_stdev) {
+            continue;
+        }
+
+        float corr = 0.0;
+        for (uint j=0;j<kernel_width;j++) {
+            for (uint i=0;i<kernel_width;i++) {
+                float delta1 = images[img1_offset + (y1+j-kernel_size)*img1_width+(x1+i-kernel_size)] - avg1;
+                float delta2 = images[img2_offset + (y2+j-kernel_size)*img2_width+(x2+i-kernel_size)] - avg2;
+                corr += delta1*delta2;
+            }
+        }
+        corr = corr/(stdev1*stdev2*kernel_point_count);
+
+        if (corr >= threshold && corr > best_corr) {
+            best_match.x = int(round(float(x2)/scale));
+            best_match.y = int(round(float(y2)/scale));
+            best_corr = corr;
+        }
+    }
+
+    uint out_pos = out_width*uint((float(y1)/scale)) + uint(float(x1)/scale);
+    float current_corr = result_corr[out_pos];
+    if (best_corr >= threshold && best_corr > current_corr)
+    {
+        result_matches[out_pos] = best_match;
+        result_corr[out_pos] = best_corr;
+    }
+}
+
+kernel void cross_check_filter(const device Parameters& p [[buffer(0)]], device int2* img1[[buffer(1)]], const device int2* img2[[buffer(2)]], uint2 index [[thread_position_in_grid]]) {
+    const uint x1 = index.x;
+    const uint y1 = index.y;
+
+    const uint img1_width = p.img1_width;
+    const uint img1_height = p.img1_height;
+
+    if (x1 >= img1_width || y1 >= img1_height) {
+        return;
+    }
+
+    const uint img2_width = p.img2_width;
+    const uint img2_height = p.img2_height;
+    const int search_area = int(p.neighbor_distance);
+
+    const int2 point = img1[img1_width*y1+x1];
+    if (point.x < 0 || point.y < 0 || uint(point.x) >= img2_width || uint(point.y) >= img2_height) {
+        return;
+    }
+
+    const uint min_x = uint(clamp(point.x-search_area, 0, int(img2_width)));
+    const uint max_x = uint(clamp(point.x+search_area+1, 0, int(img2_width)));
+    const uint min_y = uint(clamp(point.y-search_area, 0, int(img2_height)));
+    const uint max_y = uint(clamp(point.y+search_area+1, 0, int(img2_height)));
+
+    const int r_min_x = clamp(int(x1)-search_area, 0, int(img1_width));
+    const int r_max_x = clamp(int(x1)+search_area+1, 0, int(img1_width));
+    const int r_min_y = clamp(int(y1)-search_area, 0, int(img1_height));
+    const int r_max_y = clamp(int(y1)+search_area+1, 0, int(img1_height));
+
+    for (uint y2=min_y;y2<max_y;y2++) {
+        for (uint x2=min_x;x2<max_x;x2++) {
+            const int2 rpoint = img2[img2_width*y2+x2];
+            if (rpoint.x < 0 || rpoint.y < 0) {
+                continue;
+            }
+            if (rpoint.x >= r_min_x && rpoint.x < r_max_x && rpoint.y >= r_min_y && rpoint.y < r_max_y) {
+                return;
+            }
+        }
+    }
+
+    img1[img1_width*y1+x1] = int2(-1, -1);
 }
diff --git a/src/correlation/gpu/vulkan.rs b/src/correlation/gpu/vulkan.rs
index c6d32b6..811ef2f 100644
--- a/src/correlation/gpu/vulkan.rs
+++ b/src/correlation/gpu/vulkan.rs
@@ -1,6 +1,8 @@
 use std::{cmp::Ordering, collections::HashMap, error, ffi::CStr, slice};
 
+use super::Device as GpuDevice;
 use ash::{prelude::VkResult, vk};
+use nalgebra::Matrix3;
 use rayon::iter::ParallelIterator;
 
 use crate::{
@@ -113,6 +115,20 @@ impl DeviceContext {
 }
 
 impl super::DeviceContext<Device> for DeviceContext {
+    fn convert_fundamental_matrix(fundamental_matrix: &Matrix3<f64>) -> [f32; 3 * 4] {
+        let mut f = [0f32; 3 * 4];
+        // Matrix layout in GLSL (OpenGL) is pure madness: https://www.opengl.org/archives/resources/faq/technical/transformations.htm.
+        // "Column major" means that vectors are vertical and a matrix multiplies a vector.
+        // "Row major" means a horizontal vector multiplies a matrix.
+        // This says nothing about how the matrix is stored in memory.
+        for row in 0..3 {
+            for col in 0..3 {
+                f[col * 4 + row] = fundamental_matrix[(row, col)] as f32;
+            }
+        }
+        f
+    }
+
     fn is_low_power(&self) -> bool {
         self.low_power
     }
@@ -765,63 +781,6 @@ impl Device {
         })
     }
 
-    pub fn set_buffer_direction(&self, direction: &CorrelationDirection) -> Result<(), GpuError> {
-        let descriptor_sets = &self.descriptor_sets;
-        let buffers = &self.buffers()?;
-        let create_buffer_infos = |buffers: &[Buffer]| {
-            buffers
-                .iter()
-                .map(|buf| {
-                    vk::DescriptorBufferInfo::builder()
-                        .buffer(buf.buffer)
-                        .offset(0)
-                        .range(vk::WHOLE_SIZE)
-                        .build()
-                })
-                .collect::<Vec<_>>()
-        };
-        let create_write_descriptor = |i: usize, buffer_infos: &[vk::DescriptorBufferInfo]| {
-            vk::WriteDescriptorSet::builder()
-                .dst_set(descriptor_sets.descriptor_sets[i])
-                .dst_binding(0)
-                .descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
-                .buffer_info(buffer_infos)
-                .build()
-        };
-        let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) =
-            match direction {
-                CorrelationDirection::Forward => (
-                    buffers.buffer_internal_img1,
-                    buffers.buffer_internal_img2,
-                    buffers.buffer_out,
-                    buffers.buffer_out_reverse,
-                ),
-                CorrelationDirection::Reverse => (
-                    buffers.buffer_internal_img2,
-                    buffers.buffer_internal_img1,
-                    buffers.buffer_out_reverse,
-                    buffers.buffer_out,
-                ),
-            };
-        let regular_buffer_infos = create_buffer_infos(&[
-            buffers.buffer_img,
-            buffer_internal_img1,
-            buffer_internal_img2,
-            buffers.buffer_internal_int,
-            buffer_out,
-            buffers.buffer_out_corr,
-        ]);
-        let cross_check_buffer_infos = create_buffer_infos(&[buffer_out, buffer_out_reverse]);
-        let write_descriptors = [
-            create_write_descriptor(0, regular_buffer_infos.as_slice()),
-            create_write_descriptor(1, cross_check_buffer_infos.as_slice()),
-        ];
-        unsafe {
-            self.device.update_descriptor_sets(&write_descriptors, &[]);
-        }
-        Ok(())
-    }
-
     unsafe fn load_shaders(
         device: &ash::Device,
     ) -> Result<Vec<(ShaderModuleType, vk::ShaderModule)>, Box<dyn error::Error>> {
@@ -933,6 +892,63 @@ impl Device {
 }
 
 impl super::Device for Device {
+    fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError> {
+        let descriptor_sets = &self.descriptor_sets;
+        let buffers = &self.buffers()?;
+        let create_buffer_infos = |buffers: &[Buffer]| {
+            buffers
+                .iter()
+                .map(|buf| {
+                    vk::DescriptorBufferInfo::builder()
+                        .buffer(buf.buffer)
+                        .offset(0)
+                        .range(vk::WHOLE_SIZE)
+                        .build()
+                })
+                .collect::<Vec<_>>()
+        };
+        let create_write_descriptor = |i: usize, buffer_infos: &[vk::DescriptorBufferInfo]| {
+            vk::WriteDescriptorSet::builder()
+                .dst_set(descriptor_sets.descriptor_sets[i])
+                .dst_binding(0)
+                .descriptor_type(vk::DescriptorType::STORAGE_BUFFER)
+                .buffer_info(buffer_infos)
+                .build()
+        };
+        let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) =
+            match direction {
+                CorrelationDirection::Forward => (
+                    buffers.buffer_internal_img1,
+                    buffers.buffer_internal_img2,
+                    buffers.buffer_out,
+                    buffers.buffer_out_reverse,
+                ),
+                CorrelationDirection::Reverse => (
+                    buffers.buffer_internal_img2,
+                    buffers.buffer_internal_img1,
+                    buffers.buffer_out_reverse,
+                    buffers.buffer_out,
+                ),
+            };
+        let regular_buffer_infos = create_buffer_infos(&[
+            buffers.buffer_img,
+            buffer_internal_img1,
+            buffer_internal_img2,
+            buffers.buffer_internal_int,
+            buffer_out,
+            buffers.buffer_out_corr,
+        ]);
+        let cross_check_buffer_infos = create_buffer_infos(&[buffer_out, buffer_out_reverse]);
+        let write_descriptors = [
+            create_write_descriptor(0, regular_buffer_infos.as_slice()),
+            create_write_descriptor(1, cross_check_buffer_infos.as_slice()),
+        ];
+        unsafe {
+            self.device.update_descriptor_sets(&write_descriptors, &[]);
+        }
+        Ok(())
+    }
+
     unsafe fn run_shader(
         &mut self,
         dimensions: (usize, usize),