tests pass yay
andrewgazelka committed Sep 25, 2024
1 parent 45e2944 commit ba181af
Showing 30 changed files with 583 additions and 113 deletions.
36 changes: 36 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,36 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug Daft Python/Rust",
"type": "debugpy",
"request": "launch",
"program": "${workspaceFolder}/tests/expressions/test_expressions.py",
"args": [],
"console": "integratedTerminal",
"justMyCode": false,
"env": {
"PYTHONPATH": "${workspaceFolder}"
},
"serverReadyAction": {
"pattern": "pID = ([0-9]+)",
"action": "startDebugging",
"name": "Daft Rust LLDB"
}
},
{
"name": "Daft Rust LLDB",
"pid": "0",
"type": "lldb",
"request": "attach",
"program": "${workspaceFolder}/.venv/bin/python",
"stopOnEntry": false,
"sourceLanguages": [
"rust"
],
"presentation": {
"hidden": true
}
}
]
}
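
The "Daft Rust LLDB" configuration above is triggered by the `serverReadyAction`: once the launched Python process prints a line matching `pID = ([0-9]+)`, VS Code attaches LLDB to that PID. A minimal sketch of how such a marker could be emitted early in the launched script; the exact hook this repository uses is not shown in this diff, so treat the snippet as a hypothetical example:

# Hypothetical helper: print the interpreter's PID in the exact format the
# serverReadyAction regex expects, so the LLDB attach configuration starts.
import os

print(f"pID = {os.getpid()}", flush=True)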
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -3,5 +3,10 @@
"CARGO_TARGET_DIR": "target/analyzer"
},
"rust-analyzer.check.features": "all",
"rust-analyzer.cargo.features": "all"
"rust-analyzer.cargo.features": "all",
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
@@ -1268,6 +1268,7 @@ def dt_truncate(expr: PyExpr, interval: str, relative_to: PyExpr) -> PyExpr: ...
# ---
def explode(expr: PyExpr) -> PyExpr: ...
def list_sort(expr: PyExpr, desc: PyExpr) -> PyExpr: ...
def list_value_counts(expr: PyExpr) -> PyExpr: ...
def list_join(expr: PyExpr, delimiter: PyExpr) -> PyExpr: ...
def list_count(expr: PyExpr, mode: CountMode) -> PyExpr: ...
def list_get(expr: PyExpr, idx: PyExpr, default: PyExpr) -> PyExpr: ...
11 changes: 11 additions & 0 deletions daft/expressions/expressions.py
@@ -2922,6 +2922,17 @@ def join(self, delimiter: str | Expression) -> Expression:
delimiter_expr = Expression._to_expression(delimiter)
return Expression._from_pyexpr(native.list_join(self._expr, delimiter_expr._expr))

def value_counts(self) -> Expression:
"""Counts the occurrences of each unique value in the list.
Returns:
Expression: A list of structs, where each struct contains a 'value' field
representing a unique element from the original list, and a 'count' field
representing the number of times that value appears in the list.
"""
return Expression._from_pyexpr(native.list_value_counts(self._expr))


def count(self, mode: CountMode = CountMode.Valid) -> Expression:
"""Counts the number of elements in each list
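Together with the `list_value_counts` stub added to daft/daft/__init__.pyi, the `value_counts` method above exposes the new kernel through the list expression namespace. A minimal usage sketch with hypothetical column names and values; the result layout follows the docstring above (a 'value' and a 'count' per unique element):

import daft

df = daft.from_pydict({"letters": [["a", "b", "a"], ["b", "c", "b", "c"]]})
# value_counts() sits in the list namespace, alongside join(), count(), get(), ...
df = df.with_column("letter_counts", df["letters"].list.value_counts())
df.show()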
26 changes: 15 additions & 11 deletions src/arrow2/src/array/map/mod.rs
@@ -41,20 +41,24 @@ impl MapArray {
try_check_offsets_bounds(&offsets, field.len())?;

let inner_field = Self::try_get_field(&data_type)?;
if let DataType::Struct(inner) = inner_field.data_type() {
if inner.len() != 2 {
return Err(Error::InvalidArgumentError(
"MapArray's inner `Struct` must have 2 fields (keys and maps)".to_string(),
));
}
} else {

let inner_data_type = inner_field.data_type();
let DataType::Struct(inner) = inner_data_type else {
return Err(Error::InvalidArgumentError(
"MapArray expects `DataType::Struct` as its inner logical type".to_string(),
format!("MapArray expects `DataType::Struct` as its inner logical type, but found {inner_data_type:?}"),
));
};

let inner_len = inner.len();
if inner_len != 2 {
let msg = format!("MapArray's inner `Struct` must have 2 fields (keys and maps), but found {} fields", inner_len);
return Err(Error::InvalidArgumentError(msg));
}
if field.data_type() != inner_field.data_type() {

let field_data_type = field.data_type();
if field_data_type != inner_field.data_type() {
return Err(Error::InvalidArgumentError(
"MapArray expects `field.data_type` to match its inner DataType".to_string(),
format!("MapArray expects `field.data_type` to match its inner DataType, but found \n{field_data_type:?}\nvs\n\n\n{inner_field:?}"),
));
}

@@ -66,7 +70,7 @@ impl MapArray {
"validity mask length must match the number of values",
));
}

Ok(Self {
data_type,
field,
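The stricter checks above also spell out what a valid Arrow map layout looks like: the inner logical type must be a Struct with exactly two fields (keys and values), and the outer field's data type must match it. A small illustration of that shape using pyarrow (assumed available; not part of this diff):

import pyarrow as pa

# A map column's physical layout is a list of 2-field structs (key, value).
arr = pa.array([[("a", 1), ("b", 2)]], type=pa.map_(pa.string(), pa.int64()))
print(arr.type)                                 # map<string, int64>
print(arr.type.key_field, arr.type.item_field)  # the two inner struct fields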
33 changes: 17 additions & 16 deletions src/arrow2/src/offset.rs
@@ -71,7 +71,7 @@ impl<O: Offset> Offsets<O> {

/// Creates a new [`Offsets`] from an iterator of lengths
#[inline]
pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> Result<Self, Error> {
pub fn try_from_iter<I: IntoIterator<Item=usize>>(iter: I) -> Result<Self, Error> {
let iterator = iter.into_iter();
let (lower, _) = iterator.size_hint();
let mut offsets = Self::with_capacity(lower);
@@ -144,10 +144,7 @@ impl<O: Offset> Offsets<O> {
/// Returns the last offset of this container.
#[inline]
pub fn last(&self) -> &O {
match self.0.last() {
Some(element) => element,
None => unsafe { unreachable_unchecked() },
}
self.0.last().unwrap_or_else(|| unsafe { unreachable_unchecked() })
}

/// Returns a range (start, end) corresponding to the position `index`
@@ -215,7 +212,7 @@ impl<O: Offset> Offsets<O> {
/// # Errors
/// This function errors iff this operation overflows for the maximum value of `O`.
#[inline]
pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> Result<Self, Error> {
pub fn try_from_lengths<I: Iterator<Item=usize>>(lengths: I) -> Result<Self, Error> {
let mut self_ = Self::with_capacity(lengths.size_hint().0);
self_.try_extend_from_lengths(lengths)?;
Ok(self_)
@@ -225,7 +222,7 @@
/// # Errors
/// This function errors iff this operation overflows for the maximum value of `O`.
#[inline]
pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
pub fn try_extend_from_lengths<I: Iterator<Item=usize>>(
&mut self,
lengths: I,
) -> Result<(), Error> {
@@ -401,22 +398,26 @@ impl<O: Offset> OffsetsBuffer<O> {
*self.last() - *self.first()
}

pub fn ranges(&self) -> impl Iterator<Item=core::ops::Range<O>> + '_ {
self.0.windows(2).map(|w| {
let from = w[0];
let to = w[1];
debug_assert!(from <= to, "offsets must be monotonically increasing");
from..to
})
}


/// Returns the first offset.
#[inline]
pub fn first(&self) -> &O {
match self.0.first() {
Some(element) => element,
None => unsafe { unreachable_unchecked() },
}
self.0.first().unwrap_or_else(|| unsafe { unreachable_unchecked() })
}

/// Returns the last offset.
#[inline]
pub fn last(&self) -> &O {
match self.0.last() {
Some(element) => element,
None => unsafe { unreachable_unchecked() },
}
self.0.last().unwrap_or_else(|| unsafe { unreachable_unchecked() })
}

/// Returns a range (start, end) corresponding to the position `index`
@@ -460,7 +461,7 @@ impl<O: Offset> OffsetsBuffer<O> {

/// Returns an iterator with the lengths of the offsets
#[inline]
pub fn lengths(&self) -> impl Iterator<Item = usize> + '_ {
pub fn lengths(&self) -> impl Iterator<Item=usize> + '_ {
self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
}

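The new `OffsetsBuffer::ranges()` iterator pairs consecutive offsets into `from..to` ranges, complementing the existing `lengths()` iterator and debug-asserting that offsets are monotonically increasing. A Python illustration of that mapping with made-up offsets; it mirrors the semantics only, not the Rust API:

# Offsets for a hypothetical 3-row list column; row i spans offsets[i]..offsets[i + 1].
offsets = [0, 3, 3, 7]

ranges = [(offsets[i], offsets[i + 1]) for i in range(len(offsets) - 1)]
lengths = [end - start for start, end in ranges]

assert ranges == [(0, 3), (3, 3), (3, 7)]   # what ranges() yields
assert lengths == [3, 0, 4]                 # what lengths() yields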
1 change: 1 addition & 0 deletions src/daft-core/src/array/fixed_size_list_array.rs
@@ -11,6 +11,7 @@ use crate::{
#[derive(Clone, Debug)]
pub struct FixedSizeListArray {
pub field: Arc<Field>,
/// contains all the elements of the nested lists flattened into a single contiguous array.
pub flat_child: Series,
validity: Option<arrow2::bitmap::Bitmap>,
}
2 changes: 2 additions & 0 deletions src/daft-core/src/array/list_array.rs
@@ -12,6 +12,8 @@ use crate::{
pub struct ListArray {
pub field: Arc<Field>,
pub flat_child: Series,

/// Where does each row start
offsets: arrow2::offset::OffsetsBuffer<i64>,
validity: Option<arrow2::bitmap::Bitmap>,
}
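The new doc comments on `flat_child` (in fixed_size_list_array.rs above) and `offsets` describe how these arrays store data: `flat_child` holds every element of every row contiguously, `offsets` marks where each row starts, and `validity` flags null rows. A small sketch of reconstructing rows from that layout, with hypothetical values and for illustration only:

flat_child = ["a", "b", "c", "d", "e"]
offsets = [0, 2, 2, 5]          # row i spans offsets[i]..offsets[i + 1]
validity = [True, False, True]  # row 1 is null

rows = [
    flat_child[offsets[i]:offsets[i + 1]] if validity[i] else None
    for i in range(len(validity))
]
assert rows == [["a", "b"], None, ["c", "d", "e"]]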
2 changes: 1 addition & 1 deletion src/daft-core/src/array/ops/arrow2/comparison.rs
@@ -49,7 +49,7 @@ fn build_is_equal_with_nan(
}
}

fn build_is_equal(
pub fn build_is_equal(
left: &dyn Array,
right: &dyn Array,
nulls_equal: bool,
2 changes: 1 addition & 1 deletion src/daft-core/src/array/ops/cast.rs
@@ -2091,7 +2091,7 @@ impl ListArray {
}
}
}
DataType::Map(..) => Ok(MapArray::new(
DataType::Map { .. } => Ok(MapArray::new(
Field::new(self.name(), dtype.clone()),
self.clone(),
)
4 changes: 2 additions & 2 deletions src/daft-core/src/array/ops/from_arrow.rs
@@ -35,7 +35,7 @@
// TODO: Consolidate Map to use the same .to_type conversion as other logical types
// Currently, .to_type does not work for Map in Arrow2 because it requires physical types to be equivalent,
// but the physical type of MapArray in Arrow2 is a MapArray, not a ListArray
DataType::Map(..) => arrow_arr,
DataType::Map { .. } => arrow_arr,
_ => arrow_arr.to_type(data_array_field.dtype.to_arrow()?),
};
let physical = <L::PhysicalType as DaftDataType>::ArrayType::from_arrow(
@@ -98,7 +98,7 @@ impl FromArrow for ListArray {
arrow_arr.validity().cloned(),
))
}
(DataType::List(daft_child_dtype), arrow2::datatypes::DataType::Map(..)) => {
(DataType::List(daft_child_dtype), arrow2::datatypes::DataType::Map { .. }) => {
let map_arr = arrow_arr
.as_any()
.downcast_ref::<arrow2::array::MapArray>()
Expand Down