diff --git a/Cargo.toml b/Cargo.toml index dc4c097..9312c0e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,7 @@ kamadak-exif = { version = "0.5", default-features = false } ash = { version = "0.37", default-features = false, features = ["loaded"] } [target.'cfg(target_os = "macos")'.dependencies] -metal = { version = "0.27", default-features = false } +metal = { version = "0.27", default-features = false, features = ["link"] } [profile.release] strip = true diff --git a/src/correlation/gpu/correlation.wgsl b/src/correlation/gpu/correlation.wgsl deleted file mode 100644 index e6ffc61..0000000 --- a/src/correlation/gpu/correlation.wgsl +++ /dev/null @@ -1,405 +0,0 @@ -struct Parameters -{ - img1_width: u32, - img1_height: u32, - img2_width: u32, - img2_height: u32, - out_width: u32, - out_height: u32, - scale: f32, - iteration_pass: u32, - fundamental_matrix: mat3x3, - corridor_offset: i32, - corridor_start: u32, - corridor_end: u32, - kernel_size: u32, - threshold: f32, - min_stdev: f32, - neighbor_distance: u32, - extend_range: f32, - min_range: f32, -}; - -var parameters: Parameters; -// Array of image data: [img1, img2] -@group(0) @binding(0) var images: array; -// For searchdata: contains [min_corridor, stdev] for image1 -// For cross_correlate: contains [avg, stdev] for image1 -@group(0) @binding(1) var internals_img1: array>; -// Contains [avg, stdev] for image 2 -@group(0) @binding(2) var internals_img2: array>; -// Contains [min, max, neighbor_count] for the corridor range -@group(0) @binding(3) var internals_int: array>; -@group(0) @binding(4) var result_matches: array>; -@group(0) @binding(5) var result_corr: array; - -@compute @workgroup_size(16, 16, 1) -fn init_out_data(@builtin(global_invocation_id) global_id: vec3) { - let x = global_id.x; - let y = global_id.y; - - let out_width = parameters.out_width; - let out_height = parameters.out_height; - - if x < out_width && y < out_height { - result_matches[out_width*y+x] = vec2(-1, -1); - result_corr[out_width*y+x] = -1.0; - } -} - -@compute @workgroup_size(16, 16, 1) -fn prepare_initialdata_searchdata(@builtin(global_invocation_id) global_id: vec3) { - let x = global_id.x; - let y = global_id.y; - - let out_width = parameters.out_width; - let out_height = parameters.out_height; - let img1_width = parameters.img1_width; - let img1_height = parameters.img1_height; - - if x < img1_width && y < img1_height { - internals_img1[img1_width*y+x] = vec2(0.0, 0.0); - internals_int[img1_width*y+x] = vec3(-1, -1, 0); - } - if x < out_width && y < out_height { - result_corr[out_width*y+x] = -1.0; - } -} - -@compute @workgroup_size(16, 16, 1) -fn prepare_initialdata_correlation(@builtin(global_invocation_id) global_id: vec3) { - let x = global_id.x; - let y = global_id.y; - - let img1_width = parameters.img1_width; - let img1_height = parameters.img1_height; - let img2_width = parameters.img2_width; - let img2_height = parameters.img2_height; - let kernel_size = parameters.kernel_size; - let kernel_width = kernel_size*2u+1u; - let kernel_point_count = f32(kernel_width*kernel_width); - - let img1_offset = 0u; - let img2_offset = img1_width*img1_height; - - if x >= kernel_size && x < img1_width-kernel_size && y >= kernel_size && y < img1_height-kernel_size { - var avg = 0.0; - var stdev = 0.0; - - for (var j=0u;j= kernel_size && x < img2_width-kernel_size && y >= kernel_size && y < img2_height-kernel_size { - var avg = 0.0; - var stdev = 0.0; - - for (var j=0u;j>, add_const: ptr>) { - let scale = parameters.scale; - let p1 = vec3(f32(x1)/scale, f32(y1)/scale, 1.0); - let f_p1 = parameters.fundamental_matrix*p1; - if abs(f_p1.x)>abs(f_p1.y) { - (*coeff).x = f32(-f_p1.y/f_p1.x); - (*add_const).x = f32(-scale*f_p1.z/f_p1.x); - (*coeff).y = 1.0; - (*add_const).y = 0.0; - } else { - (*coeff).x = 1.0; - (*add_const).x = 0.0; - (*coeff).y = f32(-f_p1.x/f_p1.y); - (*add_const).y = f32(-scale*f_p1.z/f_p1.y); - } -} - -@compute @workgroup_size(16, 16, 1) -fn prepare_searchdata(@builtin(global_invocation_id) global_id: vec3) { - let x = global_id.x; - let y = global_id.y; - let x_signed = i32(global_id.x); - let y_signed = i32(global_id.y); - - let img1_width = parameters.img1_width; - let img1_height = parameters.img1_height; - let img2_width = parameters.img2_width; - let img2_height = parameters.img2_height; - let out_width = parameters.out_width; - let out_height = parameters.out_height; - let scale = parameters.scale; - let first_pass = parameters.iteration_pass == 0u; - let corridor_start = i32(parameters.corridor_start); - let corridor_end = i32(parameters.corridor_end); - let kernel_size = parameters.kernel_size; - let neighbor_distance = i32(parameters.neighbor_distance); - let extend_range = parameters.extend_range; - let min_range = parameters.min_range; - - let out_neighbor_width = i32(ceil(f32(parameters.neighbor_distance)/scale))*2+1; - - var start_pos_x = i32(floor(f32(x_signed-neighbor_distance)/scale)); - var start_pos_y = i32(floor(f32(y_signed-neighbor_distance)/scale)); - - var data_int = internals_int[y*img1_width+x]; - var neighbor_count = data_int[2]; - if !first_pass && neighbor_count<=0 { - return; - } - - var data = internals_img1[y*img1_width+x]; - var mid_corridor = data[0]; - var range_stdev = data[1]; - if !first_pass { - mid_corridor /= f32(neighbor_count); - } - - var coeff: vec2; - var add_const: vec2; - calculate_epipolar_line(x, y, &coeff, &add_const); - let corridor_vertical = abs(coeff.y) > abs(coeff.x); - for (var i: i32=corridor_start;i=i32(out_width) || y_out>=i32(out_height) { - continue; - } - - let coord2 = vec2(result_matches[u32(y_out)*out_width + u32(x_out)]) * scale; - if coord2.x<0.0 || coord2.y<0.0 { - continue; - } - - var corridor_pos: f32; - if corridor_vertical { - corridor_pos = round((coord2.y-add_const.y)/coeff.y); - } else { - corridor_pos = round((coord2.x-add_const.x)/coeff.x); - } - if first_pass { - mid_corridor += corridor_pos; - neighbor_count++; - } else { - let delta = corridor_pos-mid_corridor; - range_stdev += delta*delta; - } - } - - if first_pass { - data[0] = mid_corridor; - internals_img1[y*img1_width+x] = data; - data_int[2] = neighbor_count; - internals_int[y*img1_width+x] = data_int; - return; - } - - data[1] = range_stdev; - internals_img1[y*img1_width+x] = data; - range_stdev = sqrt(range_stdev/f32(neighbor_count)); - let corridor_center = i32(round(mid_corridor)); - let corridor_length = i32(round(min_range+range_stdev*extend_range)); - var min_pos = i32(kernel_size); - var max_pos: i32; - if corridor_vertical { - max_pos = i32(img2_height-kernel_size); - } else { - max_pos = i32(img2_width-kernel_size); - } - min_pos = clamp(corridor_center - corridor_length, min_pos, max_pos); - max_pos = clamp(corridor_center + corridor_length, min_pos, max_pos); - internals_int[y*img1_width+x] = vec3(min_pos, max_pos, neighbor_count); -} - -@compute @workgroup_size(16, 16, 1) -fn cross_correlate(@builtin(global_invocation_id) global_id: vec3) { - let x1 = global_id.x; - let y1 = global_id.y; - - let img1_width = parameters.img1_width; - let img1_height = parameters.img1_height; - let img2_width = parameters.img2_width; - let img2_height = parameters.img2_height; - let out_width = parameters.out_width; - let out_height = parameters.out_height; - let scale = parameters.scale; - let first_iteration = parameters.iteration_pass == 0u; - let corridor_offset = parameters.corridor_offset; - let kernel_size = parameters.kernel_size; - let threshold = parameters.threshold; - let min_stdev = parameters.min_stdev; - let kernel_width = kernel_size*2u+1u; - let kernel_point_count = f32(kernel_width*kernel_width); - - if x1 < kernel_size || x1 >= img1_width-kernel_size || y1 < kernel_size || y1 >= img1_height-kernel_size { - return; - } - - let img1_offset = 0u; - let img2_offset = img1_width*img1_height; - - var data_img1 = internals_img1[img1_width*y1+x1]; - let avg1 = data_img1[0]; - let stdev1 = data_img1[1]; - if stdev1 < min_stdev { - return; - } - - var best_corr = 0.0; - var best_match = vec2(-1, -1); - - var corridor_start = kernel_size + parameters.corridor_start; - var corridor_end = kernel_size + parameters.corridor_end; - if !first_iteration { - let data_int = internals_int[img1_width*y1 + x1]; - let min_pos_signed = data_int[0]; - let max_pos_signed = data_int[1]; - if min_pos_signed<0 || max_pos_signed<0 { - return; - } - let min_pos = u32(min_pos_signed); - let max_pos = u32(max_pos_signed); - corridor_start = clamp(min_pos+parameters.corridor_start, min_pos, max_pos); - corridor_end = clamp(min_pos+parameters.corridor_end, min_pos, max_pos); - if corridor_start >= corridor_end { - return; - } - } - - var coeff: vec2; - var add_const: vec2; - calculate_epipolar_line(x1, y1, &coeff, &add_const); - var corridor_offset_vector: vec2; - if abs(coeff.x) > abs(coeff.y) { - corridor_offset_vector = vec2(0, corridor_offset); - } else { - corridor_offset_vector = vec2(corridor_offset, 0); - } - for (var corridor_pos: u32=corridor_start;corridor_pos= img2_width-kernel_size || y2 < kernel_size || y2 >= img2_height-kernel_size { - continue; - } - - let data_img2 = internals_img2[img2_width*y2 + x2]; - let avg2 = data_img2[0]; - let stdev2 = data_img2[1]; - if stdev2 < min_stdev { - continue; - } - - var corr = 0.0; - for (var j=0u;j= threshold && corr > best_corr { - best_match.x = i32(round(f32(x2)/scale)); - best_match.y = i32(round(f32(y2)/scale)); - best_corr = corr; - } - } - - let out_pos = out_width*u32((f32(y1)/scale)) + u32(f32(x1)/scale); - let current_corr = result_corr[out_pos]; - if (best_corr >= threshold && best_corr > current_corr) - { - result_matches[out_pos] = best_match; - result_corr[out_pos] = best_corr; - } -} - -@group(1) @binding(0) var img1: array>; -@group(1) @binding(1) var img2: array>; - -@compute @workgroup_size(16, 16, 1) -fn cross_check_filter(@builtin(global_invocation_id) global_id: vec3) { - let x1 = global_id.x; - let y1 = global_id.y; - - let img1_width = parameters.img1_width; - let img1_height = parameters.img1_height; - let img2_width = parameters.img2_width; - let img2_height = parameters.img2_height; - let search_area = i32(parameters.neighbor_distance); - - if x1 >= img1_width || y1 >= img1_height { - return; - } - - let point = img1[img1_width*y1+x1]; - if point.x < 0 || point.y < 0 || u32(point.x) >= img2_width || u32(point.y) >= img2_height { - return; - } - - let min_x = u32(clamp(point.x-search_area, 0, i32(img2_width))); - let max_x = u32(clamp(point.x+search_area+1, 0, i32(img2_width))); - let min_y = u32(clamp(point.y-search_area, 0, i32(img2_height))); - let max_y = u32(clamp(point.y+search_area+1, 0, i32(img2_height))); - - let r_min_x = clamp(i32(x1)-search_area, 0, i32(img1_width)); - let r_max_x = clamp(i32(x1)+search_area+1, 0, i32(img1_width)); - let r_min_y = clamp(i32(y1)-search_area, 0, i32(img1_height)); - let r_max_y = clamp(i32(y1)+search_area+1, 0, i32(img1_height)); - - for (var y2 = min_y;y2= r_min_x && rpoint.x < r_max_x && rpoint.y >= r_min_y && rpoint.y < r_max_y { - return; - } - } - } - - img1[img1_width*y1+x1] = vec2(-1, -1); -} diff --git a/src/correlation/gpu/metal.rs b/src/correlation/gpu/metal.rs index 1b68c11..d88bea4 100644 --- a/src/correlation/gpu/metal.rs +++ b/src/correlation/gpu/metal.rs @@ -1,63 +1,452 @@ +use std::{collections::HashMap, error, ffi::c_void, fs, slice}; + +use metal::objc::rc::autoreleasepool; use nalgebra::Matrix3; -use std::{error, fmt}; +use rayon::iter::ParallelIterator; -use crate::data::Grid; +use crate::{ + correlation::{gpu::ShaderParams, Match}, + data::{Grid, Point2D}, +}; -use super::{CorrelationDirection, ProjectionMode}; +use super::{CorrelationDirection, GpuError, HardwareMode}; -pub struct GpuContext {} +// This is optimized to work with built-in Apple Silicon GPUs. +// AMD or built-in Intel GPUs will likely underperform. +// AMD64 devices are no longer available for sale, and testing/validating would require too much +// effort. +pub struct Device { + device: metal::Device, + buffers: Option, + direction: CorrelationDirection, + unified_memory: bool, + pipelines: HashMap, + command_queue: metal::CommandQueue, +} -impl GpuContext { - pub fn new( - _: (usize, usize), - _: (usize, usize), - _: ProjectionMode, - _: Matrix3, - _: bool, - ) -> Result> { - Err(GpuError::new("Compiled without GPU support").into()) +enum BufferType { + GpuOnly, + HostVisible, +} + +struct DeviceBuffers { + buffer_img: metal::Buffer, + buffer_internal_img1: metal::Buffer, + buffer_internal_img2: metal::Buffer, + buffer_internal_int: metal::Buffer, + buffer_out: metal::Buffer, + buffer_out_reverse: metal::Buffer, + buffer_out_corr: metal::Buffer, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ShaderModuleType { + InitOutData, + PrepareInitialdataSearchdata, + PrepareInitialdataCorrelation, + PrepareSearchdata, + CrossCorrelate, + CrossCheckFilter, +} + +pub struct DeviceContext { + low_power: bool, + device: Device, +} + +impl DeviceContext { + pub fn new(hardware_mode: HardwareMode) -> Result> { + autoreleasepool(|| { + if !matches!(hardware_mode, HardwareMode::Gpu | HardwareMode::GpuLowPower) { + return Err(GpuError::new("GPU mode is not enabled").into()); + }; + let low_power = matches!(hardware_mode, HardwareMode::GpuLowPower); + let device = Device::new()?; + Ok(DeviceContext { low_power, device }) + }) } +} - pub fn get_device_name(&self) -> &'static str { - "undefined" +impl super::DeviceContext for DeviceContext { + fn convert_fundamental_matrix(fundamental_matrix: &Matrix3) -> [f32; 3 * 4] { + let mut f = [0f32; 3 * 4]; + for row in 0..3 { + for col in 0..3 { + f[row * 4 + col] = fundamental_matrix[(row, col)] as f32; + } + } + f } - pub fn cross_check_filter(&mut self, _: f32, _: CorrelationDirection) {} + fn is_low_power(&self) -> bool { + self.low_power + } - pub fn complete_process( - &mut self, - ) -> Result>, Box> { - Err(GpuError::new("Compiled without GPU support").into()) + fn get_device_name(&self) -> Option { + autoreleasepool(|| Some(self.device.device.name().into())) } - pub fn correlate_images( + fn prepare_device( &mut self, - _: &Grid, - _: &Grid, - _: f32, - _: bool, - _: Option<&PL>, - _: CorrelationDirection, + img1_dimensions: (usize, usize), + img2_dimensions: (usize, usize), ) -> Result<(), Box> { - Err(GpuError::new("Compiled without GPU support").into()) + let img1_pixels = img1_dimensions.0 * img1_dimensions.1; + let img2_pixels = img2_dimensions.0 * img2_dimensions.1; + + autoreleasepool(|| { + let device = &mut self.device; + device.buffers = None; + let buffers = unsafe { device.create_buffers(img1_pixels, img2_pixels)? }; + device.buffers = Some(buffers); + device.direction = CorrelationDirection::Forward; + Ok(()) + }) + } + + fn device(&self) -> Result<&Device, GpuError> { + Ok(&self.device) + } + + fn device_mut(&mut self) -> Result<&mut Device, GpuError> { + Ok(&mut self.device) } } -#[derive(Debug)] -pub struct GpuError { - msg: &'static str, +impl Device { + fn new() -> Result> { + autoreleasepool(|| { + let device = match metal::Device::system_default() { + Some(device) => device, + None => return Err(GpuError::new("GPU mode is not enabled").into()), + }; + let direction = CorrelationDirection::Forward; + let unified_memory = device.has_unified_memory(); + let pipelines = Device::create_pipelines(&device)?; + let command_queue = device.new_command_queue(); + Ok(Device { + device, + buffers: None, + direction, + unified_memory, + pipelines, + command_queue, + }) + }) + } + + unsafe fn create_buffers( + &self, + img1_pixels: usize, + img2_pixels: usize, + ) -> Result> { + let max_pixels = img1_pixels.max(img2_pixels); + let buffer_img = self.create_buffer( + (img1_pixels + img2_pixels) * std::mem::size_of::(), + BufferType::HostVisible, + ); + + let buffer_internal_img1 = self.create_buffer( + (img1_pixels * 2) * std::mem::size_of::(), + BufferType::GpuOnly, + ); + + let buffer_internal_img2 = self.create_buffer( + (img2_pixels * 2) * std::mem::size_of::(), + BufferType::GpuOnly, + ); + + let buffer_internal_int = self.create_buffer( + max_pixels * 4 * std::mem::size_of::(), + BufferType::GpuOnly, + ); + + let buffer_out = self.create_buffer( + img1_pixels * 2 * std::mem::size_of::(), + BufferType::HostVisible, + ); + + let buffer_out_reverse = self.create_buffer( + img2_pixels * 2 * std::mem::size_of::(), + BufferType::GpuOnly, + ); + + let buffer_out_corr = self.create_buffer( + max_pixels * std::mem::size_of::(), + BufferType::HostVisible, + ); + + Ok(DeviceBuffers { + buffer_img, + buffer_internal_img1, + buffer_internal_img2, + buffer_internal_int, + buffer_out, + buffer_out_reverse, + buffer_out_corr, + }) + } + + fn buffers(&self) -> Result<&DeviceBuffers, GpuError> { + match self.buffers.as_ref() { + Some(buffers) => Ok(buffers), + None => Err(GpuError::new("Buffers not initialized")), + } + } + + unsafe fn create_buffer(&self, size: usize, buffer_type: BufferType) -> metal::Buffer { + let size = size as u64; + let options = match buffer_type { + BufferType::GpuOnly => metal::MTLResourceOptions::StorageModePrivate, + BufferType::HostVisible => { + if self.unified_memory { + metal::MTLResourceOptions::StorageModeShared + } else { + metal::MTLResourceOptions::StorageModeManaged + } + } + }; + autoreleasepool(|| self.device.new_buffer(size, options)) + } + + fn flush_buffer_to_device(&self, buffer: &metal::Buffer, size: usize) { + if !self.unified_memory { + let range = metal::NSRange { + location: 0, + length: size as u64, + }; + buffer.did_modify_range(range); + } + } + + fn flush_buffer_to_host(&self, buffer: &metal::Buffer) { + if !&self.unified_memory { + let command_buffer = self.command_queue.new_command_buffer(); + let blit_encoder = command_buffer.new_blit_command_encoder(); + blit_encoder.synchronize_resource(buffer); + blit_encoder.end_encoding(); + command_buffer.commit(); + command_buffer.wait_until_completed(); + } + } + + fn create_pipelines( + device: &metal::Device, + ) -> Result, Box> { + autoreleasepool(|| { + let mut result = HashMap::new(); + // TODO: switch to a precompiled library and use new_library_with_data + let source = fs::read_to_string("correlation.metal")?; + let library = + device.new_library_with_source(source.as_str(), &metal::CompileOptions::new())?; + for module_type in ShaderModuleType::VALUES { + let function = module_type.load(&library)?; + let pipeline = device.new_compute_pipeline_state_with_function(&function)?; + result.insert(module_type, pipeline); + } + Ok(result) + }) + } + + fn set_buffer_layout( + &self, + shader: &ShaderModuleType, + command_encoder: &metal::ComputeCommandEncoderRef, + ) -> Result<(), GpuError> { + let buffers = &self.buffers()?; + let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) = + match self.direction { + CorrelationDirection::Forward => ( + &buffers.buffer_internal_img1, + &buffers.buffer_internal_img2, + &buffers.buffer_out, + &buffers.buffer_out_reverse, + ), + CorrelationDirection::Reverse => ( + &buffers.buffer_internal_img2, + &buffers.buffer_internal_img1, + &buffers.buffer_out_reverse, + &buffers.buffer_out, + ), + }; + // Index 0 is reserved for push_constants. + if matches!(shader, ShaderModuleType::CrossCheckFilter) { + command_encoder.set_buffer(1, Some(buffer_out), 0); + command_encoder.set_buffer(2, Some(buffer_out_reverse), 0); + return Ok(()); + } + + command_encoder.set_buffer(1, Some(&buffers.buffer_img), 0); + command_encoder.set_buffer(2, Some(buffer_internal_img1), 0); + command_encoder.set_buffer(3, Some(buffer_internal_img2), 0); + command_encoder.set_buffer(4, Some(&buffers.buffer_internal_int), 0); + command_encoder.set_buffer(5, Some(buffer_out), 0); + command_encoder.set_buffer(6, Some(&buffers.buffer_out_corr), 0); + Ok(()) + } } -impl GpuError { - fn new(msg: &'static str) -> GpuError { - GpuError { msg } +impl super::Device for Device { + fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError> { + self.direction = direction.to_owned(); + Ok(()) + } + + unsafe fn run_shader( + &mut self, + dimensions: (usize, usize), + shader_type: ShaderModuleType, + shader_params: ShaderParams, + ) -> Result<(), Box> { + let workgroup_size = ((dimensions.0 + 15) / 16, ((dimensions.1 + 15) / 16)); + autoreleasepool(|| { + let pipeline = self.pipelines.get(&shader_type).unwrap(); + let command_buffer = self.command_queue.new_command_buffer(); + + let compute_encoder = command_buffer.new_compute_command_encoder(); + compute_encoder.set_compute_pipeline_state(pipeline); + + let push_constants_data = slice::from_raw_parts( + &shader_params as *const ShaderParams as *const u8, + std::mem::size_of::(), + ); + compute_encoder.set_bytes( + 0, + push_constants_data.len() as u64, + push_constants_data.as_ptr() as *const c_void, + ); + self.set_buffer_layout(&shader_type, compute_encoder)?; + + let thread_group_count = metal::MTLSize { + width: workgroup_size.0 as u64, + height: workgroup_size.1 as u64, + depth: 1, + }; + let thread_group_size = metal::MTLSize { + width: 16, + height: 16, + depth: 1, + }; + + compute_encoder.dispatch_thread_groups(thread_group_count, thread_group_size); + compute_encoder.end_encoding(); + + command_buffer.commit(); + command_buffer.wait_until_completed(); + + Ok(()) + }) + } + + unsafe fn transfer_in_images( + &self, + img1: &Grid, + img2: &Grid, + ) -> Result<(), Box> { + autoreleasepool(|| { + let buffers = self.buffers()?; + let buffer = &buffers.buffer_img; + let buffer_contents = buffer.contents(); + let img2_offset = img1.width() * img1.height(); + let size = img1.width() * img1.height() + img2.width() * img2.height(); + let img_slice = slice::from_raw_parts_mut(buffer_contents as *mut f32, size); + img1.iter() + .for_each(|(x, y, val)| img_slice[y * img1.width() + x] = *val as f32); + img2.iter().for_each(|(x, y, val)| { + img_slice[img2_offset + y * img2.width() + x] = *val as f32 + }); + + self.flush_buffer_to_device(buffer, std::mem::size_of_val(img_slice)); + Ok(()) + }) + } + + unsafe fn save_corr( + &self, + correlation_values: &mut Grid>, + correlation_threshold: f32, + ) -> Result<(), Box> { + autoreleasepool(|| { + let buffers = self.buffers()?; + let buffer = &buffers.buffer_out_corr; + self.flush_buffer_to_host(buffer); + let buffer_contents = buffer.contents(); + let size = correlation_values.width() * correlation_values.height(); + let width = correlation_values.width(); + let out_corr = slice::from_raw_parts_mut(buffer_contents as *mut f32, size); + correlation_values + .par_iter_mut() + .for_each(|(x, y, out_point)| { + let corr = out_corr[y * width + x]; + if corr > correlation_threshold { + *out_point = Some(corr); + } + }); + + Ok(()) + }) + } + + unsafe fn save_result( + &self, + out_image: &mut Grid>, + correlation_values: &Grid>, + ) -> Result<(), Box> { + autoreleasepool(|| { + let buffers = self.buffers()?; + let buffer = &buffers.buffer_out; + self.flush_buffer_to_host(buffer); + let buffer_contents = buffer.contents(); + let size = out_image.width() * out_image.height() * 2; + let width = out_image.width(); + let out_data = slice::from_raw_parts_mut(buffer_contents as *mut i32, size); + out_image.par_iter_mut().for_each(|(x, y, out_point)| { + let pos = 2 * (y * width + x); + let (match_x, match_y) = (out_data[pos], out_data[pos + 1]); + if let Some(corr) = correlation_values.val(x, y) { + *out_point = if match_x > 0 && match_y > 0 { + let point_match = Point2D::new(match_x as u32, match_y as u32); + Some((point_match, *corr)) + } else { + None + }; + } else { + *out_point = None; + }; + }); + + Ok(()) + }) + } + + unsafe fn destroy_buffers(&mut self) { + autoreleasepool(|| { + self.buffers = None; + }) } } -impl std::error::Error for GpuError {} +impl ShaderModuleType { + const VALUES: [Self; 6] = [ + Self::InitOutData, + Self::PrepareInitialdataSearchdata, + Self::PrepareInitialdataCorrelation, + Self::PrepareSearchdata, + Self::CrossCorrelate, + Self::CrossCheckFilter, + ]; + + fn load(&self, library: &metal::Library) -> Result> { + let function_name = match self { + ShaderModuleType::InitOutData => "init_out_data", + ShaderModuleType::PrepareInitialdataSearchdata => "prepare_initialdata_searchdata", + ShaderModuleType::PrepareInitialdataCorrelation => "prepare_initialdata_correlation", + ShaderModuleType::PrepareSearchdata => "prepare_searchdata", + ShaderModuleType::CrossCorrelate => "cross_correlate", + ShaderModuleType::CrossCheckFilter => "cross_check_filter", + }; + let function = library.get_function(function_name, None)?; -impl fmt::Display for GpuError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", self.msg) + Ok(function) } } diff --git a/src/correlation/gpu/mod.rs b/src/correlation/gpu/mod.rs index ca47f18..c9bae20 100644 --- a/src/correlation/gpu/mod.rs +++ b/src/correlation/gpu/mod.rs @@ -3,15 +3,19 @@ mod metal; #[cfg(not(target_os = "macos"))] mod vulkan; -use std::{error, fmt}; +#[cfg(target_os = "macos")] +use metal::ShaderModuleType; #[cfg(not(target_os = "macos"))] use vulkan::ShaderModuleType; +#[cfg(target_os = "macos")] +pub type DefaultDeviceContext = metal::DeviceContext; #[cfg(not(target_os = "macos"))] pub type DefaultDeviceContext = vulkan::DeviceContext; use crate::data::Grid; use nalgebra::Matrix3; +use std::{error, fmt}; use crate::correlation::{ CorrelationDirection, CorrelationParameters, HardwareMode, ProjectionMode, CORRIDOR_MIN_RANGE, @@ -27,6 +31,8 @@ const CORRIDOR_SEGMENT_LENGTH_LOWPOWER: usize = 8; const SEARCH_AREA_SEGMENT_LENGTH_LOWPOWER: usize = 128; trait Device { + fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError>; + unsafe fn run_shader( &mut self, dimensions: (usize, usize), @@ -59,6 +65,8 @@ trait DeviceContext where D: Device, { + fn convert_fundamental_matrix(fundamental_matrix: &Matrix3) -> [f32; 3 * 4]; + fn is_low_power(&self) -> bool; fn get_device_name(&self) -> Option; @@ -224,7 +232,7 @@ impl GpuContext<'_> { dir: CorrelationDirection, ) -> Result<(), Box> { { - let device = self.device_context.device()?; + let device = self.device_context.device_mut()?; device.set_buffer_direction(&dir)?; } let max_width = img1.width().max(img2.width()); @@ -385,17 +393,7 @@ impl GpuContext<'_> { CorrelationDirection::Forward => self.fundamental_matrix, CorrelationDirection::Reverse => self.fundamental_matrix.transpose(), }; - let mut f = [0f32; 3 * 4]; - // Matrix layout in GLSL (OpenGL) is pure madness: https://www.opengl.org/archives/resources/faq/technical/transformations.htm. - // "Column major" means that vectors are vertical and a matrix multiplies a vector. - // "Row major" means a horizontal vector multiplies a matrix. - // This says nothing about how the matrix is stored in memory. - for row in 0..3 { - for col in 0..3 { - f[col * 4 + row] = fundamental_matrix[(row, col)] as f32; - } - } - f + DefaultDeviceContext::convert_fundamental_matrix(&fundamental_matrix) } } diff --git a/src/correlation/gpu/shaders/correlation.metal b/src/correlation/gpu/shaders/correlation.metal index 5f61074..99c0fb9 100644 --- a/src/correlation/gpu/shaders/correlation.metal +++ b/src/correlation/gpu/shaders/correlation.metal @@ -1,3 +1,392 @@ -kernel void add_arrays(uint index [[thread_position_in_grid]]) +#include + +using namespace metal; + +struct Parameters { + uint img1_width; + uint img1_height; + uint img2_width; + uint img2_height; + uint out_width; + uint out_height; + float scale; + uint iteration_pass; + float3x3 fundamental_matrix; + int corridor_offset; + uint corridor_start; + uint corridor_end; + uint kernel_size; + float threshold; + float min_stdev; + uint neighbor_distance; + float extend_range; + float min_range; +}; + +kernel void init_out_data(const device Parameters& p [[buffer(0)]], device int2* result_matches[[buffer(5)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) { + const uint x = index.x; + const uint y = index.y; + + const uint out_width = p.out_width; + const uint out_height = p.out_height; + + if (x < out_width && y < out_height) { + result_matches[out_width*y+x] = int2(-1, -1); + result_corr[out_width*y+x] = -1.0; + } +} + +kernel void prepare_initialdata_searchdata(const device Parameters& p [[buffer(0)]], device float2* internals_img1[[buffer(2)]], device int3* internals_int[[buffer(4)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) { + const uint x = index.x; + const uint y = index.y; + + const uint out_width = p.out_width; + const uint out_height = p.out_height; + const uint img1_width = p.img1_width; + const uint img1_height = p.img1_height; + + if (x < img1_width && y < img1_height) { + internals_img1[img1_width*y+x] = float2(0.0, 0.0); + internals_int[img1_width*y+x] = int3(-1, -1, 0); + } + if (x < out_width && y < out_height) { + result_corr[out_width*y+x] = -1.0; + } +} + +kernel void prepare_initialdata_correlation(const device Parameters& p [[buffer(0)]], const device float* images[[buffer(1)]], device float2* internals_img1[[buffer(2)]], device float2* internals_img2[[buffer(3)]], uint2 index [[thread_position_in_grid]]) { + const uint x = index.x; + const uint y = index.y; + + const uint img1_width = p.img1_width; + const uint img1_height = p.img1_height; + const uint img2_width = p.img2_width; + const uint img2_height = p.img2_height; + const uint kernel_size = p.kernel_size; + const uint kernel_width = kernel_size*2+1; + const float kernel_point_count = float(kernel_width*kernel_width); + + const uint img1_offset = 0; + const uint img2_offset = img1_width*img1_height; + + if (x >= kernel_size && x < img1_width-kernel_size && y >= kernel_size && y < img1_height-kernel_size) { + float avg = 0.0; + float stdev = 0.0; + + for (uint j=0;j= kernel_size && x < img2_width-kernel_size && y >= kernel_size && y < img2_height-kernel_size) { + float avg = 0.0; + float stdev = 0.0; + + for (uint j=0;jabs(f_p1.y)) { + coeff = float2(-f_p1.y/f_p1.x, 1.0); + add_const = float2(-scale*f_p1.z/f_p1.x, 0.0); + } else { + coeff = float2(1.0, -f_p1.x/f_p1.y); + add_const = float2(0.0, -scale*f_p1.z/f_p1.y); + } +} + +kernel void prepare_searchdata(const device Parameters& p [[buffer(0)]], device float2* internals_img1[[buffer(2)]], device int3* internals_int[[buffer(4)]], const device int2* result_matches[[buffer(5)]], uint2 index [[thread_position_in_grid]]) { + const uint x = index.x; + const uint y = index.y; + + const uint img1_width = p.img1_width; + const uint img1_height = p.img1_height; + + if (x >= img1_width || y >= img1_height) { + return; + } + + const uint img2_width = p.img2_width; + const uint img2_height = p.img2_height; + const uint out_width = p.out_width; + const uint out_height = p.out_height; + const float scale = p.scale; + const uint kernel_size = p.kernel_size; + const uint kernel_width = kernel_size*2+1; + + const int x_signed = int(x); + const int y_signed = int(y); + + const bool first_pass = p.iteration_pass == 0; + const int neighbor_distance_signed = int(p.neighbor_distance); + const float extend_range = p.extend_range; + const float min_range = p.min_range; + + const int corridor_start_signed = int(p.corridor_start); + const int corridor_end_signed = int(p.corridor_end); + const int out_neighbor_width = int(ceil(float(p.neighbor_distance)/scale))*2+1; + + const int start_pos_x = int(floor(float(x_signed-neighbor_distance_signed)/scale)); + const int start_pos_y = int(floor(float(y_signed-neighbor_distance_signed)/scale)); + + int3 data_int = internals_int[y*img1_width+x]; + int neighbor_count = data_int[2]; + if (!first_pass && neighbor_count<=0) { + return; + } + + float2 data = internals_img1[y*img1_width+x]; + float mid_corridor = data[0]; + float range_stdev = data[1]; + if (!first_pass) { + mid_corridor /= float(neighbor_count); + } + + float2 coeff; + float2 add_const; + calculate_epipolar_line(p, x, y, coeff, add_const); + const bool corridor_vertical = abs(coeff.y) > abs(coeff.x); + for (int i=corridor_start_signed;i=int(out_width) || y_out>=int(out_height)) { + continue; + } + + float2 coord2 = float2(result_matches[uint(y_out)*out_width + uint(x_out)]) * scale; + if (coord2.x<0.0 || coord2.y<0.0) { + continue; + } + + float corridor_pos; + if (corridor_vertical) { + corridor_pos = round((coord2.y-add_const.y)/coeff.y); + } else { + corridor_pos = round((coord2.x-add_const.x)/coeff.x); + } + if (first_pass) { + mid_corridor += corridor_pos; + neighbor_count++; + } else { + float delta = corridor_pos-mid_corridor; + range_stdev += delta*delta; + } + } + + if (first_pass) { + data[0] = mid_corridor; + internals_img1[y*img1_width+x] = data; + data_int[2] = neighbor_count; + internals_int[y*img1_width+x] = data_int; + return; + } + + data[1] = range_stdev; + internals_img1[y*img1_width+x] = data; + range_stdev = sqrt(range_stdev/float(neighbor_count)); + const int corridor_center = int(round(mid_corridor)); + const int corridor_length = int(round(min_range+range_stdev*extend_range)); + int min_pos = int(kernel_size); + int max_pos; + if (corridor_vertical) { + max_pos = int(img2_height-kernel_size); + } else { + max_pos = int(img2_width-kernel_size); + } + min_pos = clamp(corridor_center - corridor_length, min_pos, max_pos); + max_pos = clamp(corridor_center + corridor_length, min_pos, max_pos); + internals_int[y*img1_width+x] = int3(min_pos, max_pos, neighbor_count); +} + +kernel void cross_correlate(const device Parameters& p [[buffer(0)]], const device float* images[[buffer(1)]], const device float2* internals_img1[[buffer(2)]], const device float2* internals_img2[[buffer(3)]], const device int3* internals_int[[buffer(4)]], device int2* result_matches[[buffer(5)]], device float* result_corr[[buffer(6)]], uint2 index [[thread_position_in_grid]]) { + const uint x1 = index.x; + const uint y1 = index.y; + + const uint img1_width = p.img1_width; + const uint img1_height = p.img1_height; + const uint kernel_size = p.kernel_size; + + if (x1 < kernel_size || x1 >= img1_width-kernel_size || y1 < kernel_size || y1 >= img1_height-kernel_size) { + return; + } + + const uint img2_width = p.img2_width; + const uint img2_height = p.img2_height; + const uint out_width = p.out_width; + const float scale = p.scale; + const bool first_iteration = p.iteration_pass == 0; + const int corridor_offset = p.corridor_offset; + const float threshold = p.threshold; + const float min_stdev = p.min_stdev; + const uint kernel_width = kernel_size*2+1; + const float kernel_point_count = float(kernel_width*kernel_width); + + const uint img1_offset = 0; + const uint img2_offset = img1_width*img1_height; + + const float2 data_img1 = internals_img1[img1_width*y1+x1]; + const float avg1 = data_img1[0]; + const float stdev1 = data_img1[1]; + if (stdev1 < min_stdev) { + return; + } + + float best_corr = 0.0; + int2 best_match = int2(-1, -1); + + uint corridor_start = kernel_size + p.corridor_start; + uint corridor_end = kernel_size + p.corridor_end; + if (!first_iteration) { + const int3 data_int = internals_int[img1_width*y1 + x1]; + int min_pos_signed = data_int[0]; + int max_pos_signed = data_int[1]; + if (min_pos_signed<0 || max_pos_signed<0) { + return; + } + const uint min_pos = uint(min_pos_signed); + const uint max_pos = uint(max_pos_signed); + corridor_start = clamp(min_pos+p.corridor_start, min_pos, max_pos); + corridor_end = clamp(min_pos+p.corridor_end, min_pos, max_pos); + if (corridor_start >= corridor_end) { + return; + } + } + + float2 coeff; + float2 add_const; + calculate_epipolar_line(p, x1, y1, coeff, add_const); + int2 corridor_offset_vector; + if (abs(add_const.x) > abs(add_const.y)) { + corridor_offset_vector = int2(corridor_offset, 0); + } else { + corridor_offset_vector = int2(0, corridor_offset); + } + for (uint corridor_pos=corridor_start;corridor_pos= img2_width-kernel_size || y2 < kernel_size || y2 >= img2_height-kernel_size) { + continue; + } + + const float2 data_img2 = internals_img2[img2_width*y2 + x2]; + const float avg2 = data_img2[0]; + const float stdev2 = data_img2[1]; + if (stdev2 < min_stdev) { + continue; + } + + float corr = 0.0; + for (uint j=0;j= threshold && corr > best_corr) { + best_match.x = int(round(float(x2)/scale)); + best_match.y = int(round(float(y2)/scale)); + best_corr = corr; + } + } + + uint out_pos = out_width*uint((float(y1)/scale)) + uint(float(x1)/scale); + float current_corr = result_corr[out_pos]; + if (best_corr >= threshold && best_corr > current_corr) + { + result_matches[out_pos] = best_match; + result_corr[out_pos] = best_corr; + } +} + +kernel void cross_check_filter(const device Parameters& p [[buffer(0)]], device int2* img1[[buffer(1)]], const device int2* img2[[buffer(2)]], uint2 index [[thread_position_in_grid]]) { + const uint x1 = index.x; + const uint y1 = index.y; + + const uint img1_width = p.img1_width; + const uint img1_height = p.img1_height; + + if (x1 >= img1_width || y1 >= img1_height) { + return; + } + + const uint img2_width = p.img2_width; + const uint img2_height = p.img2_height; + const int search_area = int(p.neighbor_distance); + + const int2 point = img1[img1_width*y1+x1]; + if (point.x < 0 || point.y < 0 || uint(point.x) >= img2_width || uint(point.y) >= img2_height) { + return; + } + + const uint min_x = uint(clamp(point.x-search_area, 0, int(img2_width))); + const uint max_x = uint(clamp(point.x+search_area+1, 0, int(img2_width))); + const uint min_y = uint(clamp(point.y-search_area, 0, int(img2_height))); + const uint max_y = uint(clamp(point.y+search_area+1, 0, int(img2_height))); + + const int r_min_x = clamp(int(x1)-search_area, 0, int(img1_width)); + const int r_max_x = clamp(int(x1)+search_area+1, 0, int(img1_width)); + const int r_min_y = clamp(int(y1)-search_area, 0, int(img1_height)); + const int r_max_y = clamp(int(y1)+search_area+1, 0, int(img1_height)); + + for (uint y2=min_y;y2= r_min_x && rpoint.x < r_max_x && rpoint.y >= r_min_y && rpoint.y < r_max_y) { + return; + } + } + } + + img1[img1_width*y1+x1] = int2(-1, -1); } diff --git a/src/correlation/gpu/vulkan.rs b/src/correlation/gpu/vulkan.rs index c6d32b6..811ef2f 100644 --- a/src/correlation/gpu/vulkan.rs +++ b/src/correlation/gpu/vulkan.rs @@ -1,6 +1,8 @@ use std::{cmp::Ordering, collections::HashMap, error, ffi::CStr, slice}; +use super::Device as GpuDevice; use ash::{prelude::VkResult, vk}; +use nalgebra::Matrix3; use rayon::iter::ParallelIterator; use crate::{ @@ -113,6 +115,20 @@ impl DeviceContext { } impl super::DeviceContext for DeviceContext { + fn convert_fundamental_matrix(fundamental_matrix: &Matrix3) -> [f32; 3 * 4] { + let mut f = [0f32; 3 * 4]; + // Matrix layout in GLSL (OpenGL) is pure madness: https://www.opengl.org/archives/resources/faq/technical/transformations.htm. + // "Column major" means that vectors are vertical and a matrix multiplies a vector. + // "Row major" means a horizontal vector multiplies a matrix. + // This says nothing about how the matrix is stored in memory. + for row in 0..3 { + for col in 0..3 { + f[col * 4 + row] = fundamental_matrix[(row, col)] as f32; + } + } + f + } + fn is_low_power(&self) -> bool { self.low_power } @@ -765,63 +781,6 @@ impl Device { }) } - pub fn set_buffer_direction(&self, direction: &CorrelationDirection) -> Result<(), GpuError> { - let descriptor_sets = &self.descriptor_sets; - let buffers = &self.buffers()?; - let create_buffer_infos = |buffers: &[Buffer]| { - buffers - .iter() - .map(|buf| { - vk::DescriptorBufferInfo::builder() - .buffer(buf.buffer) - .offset(0) - .range(vk::WHOLE_SIZE) - .build() - }) - .collect::>() - }; - let create_write_descriptor = |i: usize, buffer_infos: &[vk::DescriptorBufferInfo]| { - vk::WriteDescriptorSet::builder() - .dst_set(descriptor_sets.descriptor_sets[i]) - .dst_binding(0) - .descriptor_type(vk::DescriptorType::STORAGE_BUFFER) - .buffer_info(buffer_infos) - .build() - }; - let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) = - match direction { - CorrelationDirection::Forward => ( - buffers.buffer_internal_img1, - buffers.buffer_internal_img2, - buffers.buffer_out, - buffers.buffer_out_reverse, - ), - CorrelationDirection::Reverse => ( - buffers.buffer_internal_img2, - buffers.buffer_internal_img1, - buffers.buffer_out_reverse, - buffers.buffer_out, - ), - }; - let regular_buffer_infos = create_buffer_infos(&[ - buffers.buffer_img, - buffer_internal_img1, - buffer_internal_img2, - buffers.buffer_internal_int, - buffer_out, - buffers.buffer_out_corr, - ]); - let cross_check_buffer_infos = create_buffer_infos(&[buffer_out, buffer_out_reverse]); - let write_descriptors = [ - create_write_descriptor(0, regular_buffer_infos.as_slice()), - create_write_descriptor(1, cross_check_buffer_infos.as_slice()), - ]; - unsafe { - self.device.update_descriptor_sets(&write_descriptors, &[]); - } - Ok(()) - } - unsafe fn load_shaders( device: &ash::Device, ) -> Result, Box> { @@ -933,6 +892,63 @@ impl Device { } impl super::Device for Device { + fn set_buffer_direction(&mut self, direction: &CorrelationDirection) -> Result<(), GpuError> { + let descriptor_sets = &self.descriptor_sets; + let buffers = &self.buffers()?; + let create_buffer_infos = |buffers: &[Buffer]| { + buffers + .iter() + .map(|buf| { + vk::DescriptorBufferInfo::builder() + .buffer(buf.buffer) + .offset(0) + .range(vk::WHOLE_SIZE) + .build() + }) + .collect::>() + }; + let create_write_descriptor = |i: usize, buffer_infos: &[vk::DescriptorBufferInfo]| { + vk::WriteDescriptorSet::builder() + .dst_set(descriptor_sets.descriptor_sets[i]) + .dst_binding(0) + .descriptor_type(vk::DescriptorType::STORAGE_BUFFER) + .buffer_info(buffer_infos) + .build() + }; + let (buffer_internal_img1, buffer_internal_img2, buffer_out, buffer_out_reverse) = + match direction { + CorrelationDirection::Forward => ( + buffers.buffer_internal_img1, + buffers.buffer_internal_img2, + buffers.buffer_out, + buffers.buffer_out_reverse, + ), + CorrelationDirection::Reverse => ( + buffers.buffer_internal_img2, + buffers.buffer_internal_img1, + buffers.buffer_out_reverse, + buffers.buffer_out, + ), + }; + let regular_buffer_infos = create_buffer_infos(&[ + buffers.buffer_img, + buffer_internal_img1, + buffer_internal_img2, + buffers.buffer_internal_int, + buffer_out, + buffers.buffer_out_corr, + ]); + let cross_check_buffer_infos = create_buffer_infos(&[buffer_out, buffer_out_reverse]); + let write_descriptors = [ + create_write_descriptor(0, regular_buffer_infos.as_slice()), + create_write_descriptor(1, cross_check_buffer_infos.as_slice()), + ]; + unsafe { + self.device.update_descriptor_sets(&write_descriptors, &[]); + } + Ok(()) + } + unsafe fn run_shader( &mut self, dimensions: (usize, usize),