diff --git a/crates/polars-core/src/frame/row/mod.rs b/crates/polars-core/src/frame/row/mod.rs index 7e899fbb2660..4f9e0e84fd6e 100644 --- a/crates/polars-core/src/frame/row/mod.rs +++ b/crates/polars-core/src/frame/row/mod.rs @@ -11,7 +11,7 @@ pub use av_buffer::*; use rayon::prelude::*; use crate::prelude::*; -use crate::utils::try_get_supertype; +use crate::utils::{dtypes_to_supertype, try_get_supertype}; use crate::POOL; #[derive(Debug, Clone, PartialEq, Eq, Default)] @@ -83,33 +83,15 @@ pub fn coerce_data_type>(datatypes: &[A]) -> DataType { try_get_supertype(lhs, rhs).unwrap_or(String) } -pub fn any_values_to_dtype(column: &[AnyValue]) -> PolarsResult<(DataType, usize)> { - // we need an index-map as the order of dtypes influences how the - // struct fields are constructed. - let mut types_set = PlIndexSet::new(); - for val in column.iter() { - types_set.insert(val.into()); - } - let n_types = types_set.len(); - Ok((types_set_to_dtype(types_set)?, n_types)) -} - -fn types_set_to_dtype(types_set: PlIndexSet) -> PolarsResult { - types_set - .into_iter() - .map(Ok) - .reduce(|a, b| try_get_supertype(&a?, &b?)) - .unwrap() -} - /// Infer schema from rows and set the supertypes of the columns as column data type. pub fn rows_to_schema_supertypes( rows: &[Row], infer_schema_length: Option, ) -> PolarsResult { + polars_ensure!(!rows.is_empty(), NoData: "no rows, cannot infer schema"); + // no of rows to use to infer dtype let max_infer = infer_schema_length.unwrap_or(rows.len()); - polars_ensure!(!rows.is_empty(), NoData: "no rows, cannot infer schema"); let mut dtypes: Vec> = vec![PlIndexSet::new(); rows[0].0.len()]; for row in rows.iter().take(max_infer) { @@ -125,7 +107,7 @@ pub fn rows_to_schema_supertypes( let dtype = if types_set.is_empty() { DataType::Unknown } else { - types_set_to_dtype(types_set)? + dtypes_to_supertype(&types_set)? }; Ok(Field::new(format!("column_{i}").as_ref(), dtype)) }) diff --git a/crates/polars-core/src/series/any_value.rs b/crates/polars-core/src/series/any_value.rs index e3f2f0ce2b57..178148ec4446 100644 --- a/crates/polars-core/src/series/any_value.rs +++ b/crates/polars-core/src/series/any_value.rs @@ -3,7 +3,7 @@ use std::fmt::Write; #[cfg(feature = "object")] use crate::chunked_array::object::registry::ObjectRegistry; use crate::prelude::*; -use crate::utils::try_get_supertype; +use crate::utils::any_values_to_supertype; impl<'a, T: AsRef<[AnyValue<'a>]>> NamedFrom]> for Series { fn new(name: &str, v: T) -> Self { @@ -47,28 +47,10 @@ impl Series { }, } } - fn get_any_values_supertype(values: &[AnyValue]) -> PolarsResult { - let mut supertype = DataType::Null; - let mut dtypes = PlHashSet::::new(); - for av in values { - if dtypes.insert(av.dtype()) { - supertype = try_get_supertype(&supertype, &av.dtype()).map_err(|_| { - polars_err!( - SchemaMismatch: - "failed to infer supertype of values; partial supertype is {:?}, found value of type {:?}: {}", - supertype, av.dtype(), av - ) - } - )?; - } - } - Ok(supertype) - } - let dtype = if strict { get_first_non_null_dtype(values) } else { - get_any_values_supertype(values)? + any_values_to_supertype(values)? }; Self::from_any_values_and_dtype(name, values, &dtype, strict) } diff --git a/crates/polars-core/src/utils/any_value.rs b/crates/polars-core/src/utils/any_value.rs new file mode 100644 index 000000000000..5a9688cc1394 --- /dev/null +++ b/crates/polars-core/src/utils/any_value.rs @@ -0,0 +1,37 @@ +use crate::prelude::*; +use crate::utils::dtypes_to_supertype; + +/// Determine the supertype of a collection of [`AnyValue`]. +/// +/// [`AnyValue`]: crate::datatypes::AnyValue +pub fn any_values_to_supertype<'a, I>(values: I) -> PolarsResult +where + I: IntoIterator>, +{ + let dtypes = any_values_to_dtype_set(values); + dtypes_to_supertype(&dtypes) +} + +/// Determine the supertype and the number of unique data types of a collection of [`AnyValue`]. +/// +/// [`AnyValue`]: crate::datatypes::AnyValue +pub fn any_values_to_supertype_and_n_dtypes<'a, I>(values: I) -> PolarsResult<(DataType, usize)> +where + I: IntoIterator>, +{ + let dtypes = any_values_to_dtype_set(values); + let supertype = dtypes_to_supertype(&dtypes)?; + let n_dtypes = dtypes.len(); + Ok((supertype, n_dtypes)) +} + +/// Extract the ordered set of data types from a collection of AnyValues +/// +/// Retaining the order is important if the set is used to determine a supertype, +/// as this can influence how Struct fields are constructed. +fn any_values_to_dtype_set<'a, I>(values: I) -> PlIndexSet +where + I: IntoIterator>, +{ + values.into_iter().map(|av| av.into()).collect() +} diff --git a/crates/polars-core/src/utils/mod.rs b/crates/polars-core/src/utils/mod.rs index 820589c2600c..f4d1d800e74f 100644 --- a/crates/polars-core/src/utils/mod.rs +++ b/crates/polars-core/src/utils/mod.rs @@ -1,9 +1,11 @@ +mod any_value; pub mod flatten; pub(crate) mod series; mod supertype; use std::borrow::Cow; use std::ops::{Deref, DerefMut}; +pub use any_value::*; use arrow::bitmap::bitmask::BitMask; use arrow::bitmap::Bitmap; pub use arrow::legacy::utils::*; diff --git a/crates/polars-core/src/utils/supertype.rs b/crates/polars-core/src/utils/supertype.rs index f6878fe419bc..a46c9390b81c 100644 --- a/crates/polars-core/src/utils/supertype.rs +++ b/crates/polars-core/src/utils/supertype.rs @@ -1,13 +1,17 @@ use super::*; -/// Given two datatypes, determine the supertype that both types can safely be cast to +/// Given two data types, determine the data type that both types can safely be cast to. +/// +/// Returns a [`PolarsError::ComputeError`] if no such data type exists. pub fn try_get_supertype(l: &DataType, r: &DataType) -> PolarsResult { get_supertype(l, r).ok_or_else( || polars_err!(ComputeError: "failed to determine supertype of {} and {}", l, r), ) } -/// Given two datatypes, determine the supertype that both types can safely be cast to +/// Given two data types, determine the data type that both types can safely be cast to. +/// +/// Returns [`None`] if no such data type exists. pub fn get_supertype(l: &DataType, r: &DataType) -> Option { fn inner(l: &DataType, r: &DataType) -> Option { use DataType::*; @@ -278,6 +282,20 @@ pub fn get_supertype(l: &DataType, r: &DataType) -> Option { inner(l, r).or_else(|| inner(r, l)) } +/// Given multiple data types, determine the data type that all types can safely be cast to. +/// +/// Returns [`DataType::Null`] if no data types were passed. +pub fn dtypes_to_supertype<'a, I>(dtypes: I) -> PolarsResult +where + I: IntoIterator, +{ + dtypes + .into_iter() + .try_fold(DataType::Null, |supertype, dtype| { + try_get_supertype(&supertype, dtype) + }) +} + #[cfg(feature = "dtype-struct")] fn union_struct_fields(fields_a: &[Field], fields_b: &[Field]) -> Option { let (longest, shortest) = { diff --git a/py-polars/src/conversion/any_value.rs b/py-polars/src/conversion/any_value.rs index 0e9e73961ace..4cbdd3d5ac01 100644 --- a/py-polars/src/conversion/any_value.rs +++ b/py-polars/src/conversion/any_value.rs @@ -2,7 +2,7 @@ use polars::chunked_array::object::PolarsObjectSafe; use polars::datatypes::{DataType, Field, OwnedObject, PlHashMap, TimeUnit}; use polars::prelude::{AnyValue, Series}; -use polars_core::frame::row::any_values_to_dtype; +use polars_core::utils::any_values_to_supertype_and_n_dtypes; use pyo3::exceptions::{PyOverflowError, PyTypeError}; use pyo3::intern; use pyo3::prelude::*; @@ -282,11 +282,11 @@ pub(crate) fn py_object_to_any_value(ob: &PyAny, strict: bool) -> PyResult