[CHORE] Begin integrating Rust Logical Plan with Dataframe API #1207

Merged: 6 commits, Aug 2, 2023
2 changes: 1 addition & 1 deletion daft/context.py

@@ -56,7 +56,7 @@ def _get_runner_config_from_env() -> _RunnerConfig:

 def _get_planner_from_env() -> bool:
     """Returns whether or not to use the new query planner."""
-    return bool(int(os.getenv("DAFT_DEVELOPER_RUST_QUERY_PLANNER", default="1")))
+    return bool(int(os.getenv("DAFT_DEVELOPER_RUST_QUERY_PLANNER", default="0")))


 @dataclasses.dataclass(frozen=True)
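This flips the Rust query planner from opt-out to opt-in. A minimal sketch of opting back in (assumes, per _get_planner_from_env above, that the flag is read from the environment when the daft context is first created):

    import os

    # Set before the daft context is created so _get_planner_from_env() sees it.
    os.environ["DAFT_DEVELOPER_RUST_QUERY_PLANNER"] = "1"

    import daft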
4 changes: 2 additions & 2 deletions daft/dataframe/dataframe.py

@@ -29,7 +29,7 @@

 from daft.datatype import DataType
 from daft.errors import ExpressionTypeError
 from daft.expressions import Expression, ExpressionsProjection, col, lit
-from daft.logical import logical_plan
+from daft.logical import logical_plan, rust_logical_plan
 from daft.logical.aggregation_plan_builder import AggregationPlanBuilder
 from daft.resource_request import ResourceRequest
 from daft.runners.partitioning import PartitionCacheEntry, PartitionSet

@@ -65,7 +65,7 @@ def __init__(self, plan: logical_plan.LogicalPlan) -> None:

         Args:
             plan: LogicalPlan describing the steps required to arrive at this DataFrame
         """
-        if not isinstance(plan, logical_plan.LogicalPlan):
+        if not isinstance(plan, (logical_plan.LogicalPlan, rust_logical_plan.RustLogicalPlanBuilder)):
             if isinstance(plan, dict):
                 raise ValueError(
                     f"DataFrames should be constructed with a dictionary of columns using `daft.from_pydict`"

Contributor — on the new isinstance check: Should we change the type annotation of the plan arg and the _plan property accessor to be a union of LogicalPlan and RustLogicalPlanBuilder?

Author: Discussed offline; LogicalPlan has too many methods, so a Union will not work out of the box.
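The "too many methods" objection: a type checker only permits attribute access on a Union when every member of the union supports it, so the annotation cannot be tightened until RustLogicalPlanBuilder mirrors LogicalPlan's full method surface. A minimal sketch of the failure mode (the class bodies are illustrative stubs, not the real APIs):

    from typing import Union

    class LogicalPlan:
        def num_partitions(self) -> int: ...  # one of many methods

    class RustLogicalPlanBuilder:
        def schema(self) -> None: ...  # only a small surface so far

    def run(plan: Union[LogicalPlan, RustLogicalPlanBuilder]) -> int:
        # mypy: error: Item "RustLogicalPlanBuilder" of the union
        # has no attribute "num_partitions"
        return plan.num_partitions()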
35 changes: 34 additions & 1 deletion daft/io/common.py

@@ -3,9 +3,10 @@

 import fsspec

 from daft.context import get_context
+from daft.daft import LogicalPlanBuilder
 from daft.datasources import SourceInfo
 from daft.datatype import DataType
-from daft.logical import logical_plan
+from daft.logical import logical_plan, rust_logical_plan
 from daft.logical.schema import Schema

@@ -57,3 +58,35 @@

         # one partition per filepath. This will change in the future and our logic here should change accordingly.
         num_partitions=len(listing_details_partition_set),
     )
+
+
+def _get_files_scan_rustplan(
+    path: str | list[str],
+    schema_hints: dict[str, DataType] | None,
+    source_info: SourceInfo,
+    fs: fsspec.AbstractFileSystem | None,
+) -> rust_logical_plan.RustLogicalPlanBuilder:
+    """Returns a LogicalPlanBuilder with the file scan."""
+    # Glob the path using the Runner
+    runner_io = get_context().runner().runner_io()
+    paths = path if isinstance(path, list) else [str(path)]
+    listing_details_partition_set = runner_io.glob_paths_details(paths, source_info, fs)
+
+    # Infer schema if no hints provided
+    inferred_or_provided_schema = (
+        _get_schema_from_hints(schema_hints)
+        if schema_hints is not None
+        else runner_io.get_schema_from_first_filepath(listing_details_partition_set, source_info, fs)
+    )
+
+    # Construct plan
+    paths_details = listing_details_partition_set.to_pydict()
+    filepaths = paths_details[runner_io.FS_LISTING_PATH_COLUMN_NAME]
+    rs_schema = inferred_or_provided_schema._schema
+    builder = LogicalPlanBuilder.read_parquet(filepaths, rs_schema)
+    pybuilder = rust_logical_plan.RustLogicalPlanBuilder(builder)
+    return pybuilder

(Codecov: the added lines in _get_files_scan_rustplan are not covered by tests.)
42 changes: 31 additions & 11 deletions daft/io/parquet.py

@@ -1,14 +1,16 @@

 # isort: dont-add-import: from __future__ import annotations

-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union, cast

 import fsspec

 from daft.api_annotations import PublicAPI
 from daft.context import get_context
 from daft.dataframe import DataFrame
 from daft.datasources import ParquetSourceInfo
 from daft.datatype import DataType
-from daft.io.common import _get_tabular_files_scan
+from daft.io.common import _get_files_scan_rustplan, _get_tabular_files_scan
+from daft.logical.logical_plan import LogicalPlan

 if TYPE_CHECKING:
     from daft.io import IOConfig

@@ -43,16 +45,34 @@

     returns:
         DataFrame: parsed DataFrame
     """

     if isinstance(path, list) and len(path) == 0:
         raise ValueError(f"Cannot read DataFrame from an empty list of Parquet filepaths")

-    plan = _get_tabular_files_scan(
-        path,
-        schema_hints,
-        ParquetSourceInfo(
-            io_config=io_config,
-            use_native_downloader=use_native_downloader,
-        ),
-        fs,
-    )
+    context = get_context()
+
+    if context.use_rust_planner:
+        plan = cast(
+            LogicalPlan,
+            _get_files_scan_rustplan(
+                path,
+                schema_hints,
+                ParquetSourceInfo(
+                    io_config=io_config,
+                    use_native_downloader=use_native_downloader,
+                ),
+                fs,
+            ),
+        )  # Cast for temporary type checking.
+    else:
+        plan = _get_tabular_files_scan(
+            path,
+            schema_hints,
+            ParquetSourceInfo(
+                io_config=io_config,
+                use_native_downloader=use_native_downloader,
+            ),
+            fs,
+        )

     return DataFrame(plan)

(Codecov: added line 55 in daft/io/parquet.py was not covered by tests.)

clarkzinzow (Contributor), Aug 2, 2023 — on the `if context.use_rust_planner:` line:

Some quick questions about the current "switching between all-Python logical plan vs. Rust-based logical plan builder" setup.

We previously talked about exposing a LogicalPlanBuilder interface that would have two implementations, the all-Python query planner (PyLogicalPlanBuilder) and the new Rust query planner (RustLogicalPlanBuilder), where the DataFrame API layer would be rewritten to use that LogicalPlanBuilder interface and we could limit the number of places we'd need to switch on context.use_rust_planner. E.g. the DataFrame constructor would take a LogicalPlanBuilder instance instead of a union of the all-Python LogicalPlan and the RustLogicalPlanBuilder:

    def __init__(self, builder: LogicalPlanBuilder) -> None:
        if not isinstance(builder, LogicalPlanBuilder):
            if isinstance(builder, dict):
                raise ValueError(
                    f"DataFrames should be constructed with a dictionary of columns using `daft.from_pydict`"
                )
            if isinstance(builder, list):
                raise ValueError(
                    f"DataFrames should be constructed with a list of dictionaries using `daft.from_pylist`"
                )
            raise ValueError(f"Expected DataFrame to be constructed with a LogicalPlanBuilder, received: {builder}")

        self._builder = builder
        self._result_cache: Optional[PartitionCacheEntry] = None
        self._preview = DataFramePreview(preview_partition=None, dataframe_num_rows=None)

And read_parquet() would look something like this:

    def read_parquet(
        path: Union[str, List[str]],
        schema_hints: Optional[Dict[str, DataType]] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        io_config: Optional["IOConfig"] = None,
        use_native_downloader: bool = False,
    ) -> DataFrame:
        if isinstance(path, list) and len(path) == 0:
            raise ValueError(f"Cannot read DataFrame from an empty list of Parquet filepaths")

        context = get_context()
        # This could eventually be reduced to context.new_logical_plan_builder().
        builder = RustLogicalPlanBuilder() if context.use_rust_planner else PyLogicalPlanBuilder()
        new_builder = builder.scan(
            path,
            schema_hints,
            ParquetSourceInfo(
                io_config=io_config,
                use_native_downloader=use_native_downloader,
            ),
            fs,
        )
        return DataFrame(new_builder)

This should end up being a good bit cleaner than imperatively switching between implementations within each DataFrame API function/method, and we should be able to line up the builder interfaces (such as builder.scan() or builder.filter()) since they're adding the same logical op to each underlying logical plan implementation. This does, however, require more upfront changes to the DataFrame implementation, since each method would need to be ported to the PyLogicalPlanBuilder interface (although this should be straightforward).

Do you agree that this is a better long-term approach, and if so, are you thinking that deferring this refactor is the best choice for now?

Author: Discussed offline --

> Do you agree that this is a better long-term approach, and if so, are you thinking that deferring this refactor is the best choice for now?

Yes and yes, to get Rust plan creation from DataFrames in our hands ASAP.

Contributor — on the `cast(...)` call: Instead of this type casting, we could make the DataFrame constructor take a Union[LogicalPlan, RustLogicalPlanBuilder].

Author: (Discussed offline; same as above.)
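The interface sketched in the thread above might look like the following. This is a hypothetical outline (neither the abstract base class nor PyLogicalPlanBuilder exists in this PR), under the assumption that both planners can expose the same builder methods:

    from abc import ABC, abstractmethod

    class LogicalPlanBuilder(ABC):
        """Hypothetical common interface over the Python and Rust planners."""

        @abstractmethod
        def scan(self, path, schema_hints, source_info, fs) -> "LogicalPlanBuilder":
            """Append a file-scan source op and return the new builder."""

        @abstractmethod
        def schema(self) -> "Schema":
            """Schema of the plan built so far."""

    class PyLogicalPlanBuilder(LogicalPlanBuilder): ...  # would wrap the all-Python LogicalPlan
    class RustLogicalPlanBuilder(LogicalPlanBuilder): ...  # would wrap the pyo3 LogicalPlanBuilder

DataFrame would then hold a single self._builder and each DataFrame method would dispatch through it, eliminating the per-method use_rust_planner switches.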
15 changes: 15 additions & 0 deletions daft/logical/rust_logical_plan.py

@@ -0,0 +1,15 @@

+from __future__ import annotations
+
+from daft.daft import LogicalPlanBuilder
+from daft.logical.schema import Schema
+
+
+class RustLogicalPlanBuilder:
+    """Wrapper class for the new LogicalPlanBuilder in Rust."""
+
+    def __init__(self, builder: LogicalPlanBuilder) -> None:
+        self.builder = builder
+
+    def schema(self) -> Schema:
+        pyschema = self.builder.schema()
+        return Schema._from_pyschema(pyschema)

(Codecov: the added lines in this new file are not covered by tests.)
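A minimal usage sketch of the wrapper (hypothetical filepath; assumes schema is an existing daft.logical.schema.Schema, whose underlying PySchema is reached via ._schema exactly as _get_files_scan_rustplan does above):

    from daft.daft import LogicalPlanBuilder
    from daft.logical.rust_logical_plan import RustLogicalPlanBuilder

    # `schema` is assumed to already exist; read_parquet is the pyo3 binding
    # added in src/daft-plan/src/builder.rs below.
    rs_builder = LogicalPlanBuilder.read_parquet(["s3://bucket/data.parquet"], schema._schema)
    py_builder = RustLogicalPlanBuilder(rs_builder)
    print(py_builder.schema())  # round-trips the schema back out of Rust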
16 changes: 11 additions & 5 deletions src/daft-plan/src/builder.rs

@@ -1,7 +1,8 @@

 use std::sync::Arc;

 use crate::logical_plan::LogicalPlan;
-use crate::{ops, source_info};
+use crate::ops;
+use crate::source_info::{FileFormat, FilesInfo, SourceInfo};

 #[cfg(feature = "python")]
 use daft_core::python::schema::PySchema;

@@ -10,14 +11,14 @@ use pyo3::prelude::*;

 #[cfg_attr(feature = "python", pyclass)]
 pub struct LogicalPlanBuilder {
-    _plan: Arc<LogicalPlan>,
+    plan: Arc<LogicalPlan>,
 }

 impl LogicalPlanBuilder {
     // Create a new LogicalPlanBuilder for a Source node.
     pub fn from_source(source: ops::Source) -> Self {
         Self {
-            _plan: LogicalPlan::Source(source).into(),
+            plan: LogicalPlan::Source(source).into(),
         }
     }
 }

@@ -26,8 +27,9 @@ impl LogicalPlanBuilder {

 #[pymethods]
 impl LogicalPlanBuilder {
     #[staticmethod]
-    pub fn source(filepaths: Vec<String>, schema: &PySchema) -> PyResult<LogicalPlanBuilder> {
-        let source_info = source_info::SourceInfo::FilesInfo(source_info::FilesInfo::new(
+    pub fn read_parquet(filepaths: Vec<String>, schema: &PySchema) -> PyResult<LogicalPlanBuilder> {
+        let source_info = SourceInfo::FilesInfo(FilesInfo::new(
+            FileFormat::Parquet,
             filepaths,
             schema.schema.clone(),
         ));

@@ -37,4 +39,8 @@ impl LogicalPlanBuilder {

         ));
         Ok(logical_plan_builder)
     }
+
+    pub fn schema(&self) -> PyResult<PySchema> {
+        Ok(self.plan.schema().into())
+    }
 }

Contributor — on read_parquet: Nit: Instead of having a LogicalPlanBuilder method per read API (e.g. for reading Parquets, CSVs, JSONs, etc.), we could still have a source method that also takes a FileFormat enum variant, which would then be incorporated into the SourceInfo.

On second thought, making the LogicalPlanBuilder methods 1:1 with the DataFrame APIs makes more sense to me than 1:1 with the logical operators, since the former leaks fewer representational details to the Python side and makes the builder abstraction more useful. So keeping format-specific read_* methods seems like the best choice to me! E.g. it would also allow us to hide the FileFormat enum from the Python side.

Author:

> since the former leaks fewer representational details to the Python side and makes the builder abstraction more useful

Yeah, this was what I was hoping for as well!
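To make that trade-off concrete from the Python side, a hypothetical comparison (only read_parquet exists in this PR; the source/FileFormat variant is the rejected alternative and is not exposed anywhere). Here filepaths would be a list of path strings and pyschema the underlying PySchema, as in _get_files_scan_rustplan:

    # Rejected shape, 1:1 with the logical Source operator: Python callers
    # would have to know about file formats. (Hypothetical -- not in this PR.)
    builder = LogicalPlanBuilder.source(filepaths, pyschema, FileFormat.Parquet)

    # Shape kept in this PR, 1:1 with the DataFrame read APIs: FileFormat
    # stays an implementation detail inside Rust.
    builder = LogicalPlanBuilder.read_parquet(filepaths, pyschema)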
10 changes: 10 additions & 0 deletions src/daft-plan/src/logical_plan.rs

@@ -1,6 +1,16 @@

+use daft_core::schema::SchemaRef;
+
 use crate::ops::*;

 #[derive(Clone)]
 pub enum LogicalPlan {
     Source(Source),
 }
+
+impl LogicalPlan {
+    pub fn schema(&self) -> SchemaRef {
+        match self {
+            Self::Source(Source { schema, .. }) => schema.clone(),
+        }
+    }
+}
15 changes: 13 additions & 2 deletions src/daft-plan/src/source_info.rs

@@ -13,13 +13,24 @@ impl SourceInfo {
     }
 }

+pub enum FileFormat {
+    Parquet,
+    Csv,
+    Json,
+}
+
 pub struct FilesInfo {
+    pub file_format: FileFormat,
     pub filepaths: Vec<String>, // TODO: pull in some sort of URL crate for this
     pub schema: SchemaRef,
 }

 impl FilesInfo {
-    pub(crate) fn new(filepaths: Vec<String>, schema: SchemaRef) -> Self {
-        Self { filepaths, schema }
+    pub(crate) fn new(file_format: FileFormat, filepaths: Vec<String>, schema: SchemaRef) -> Self {
+        Self {
+            file_format,
+            filepaths,
+            schema,
+        }
     }
 }