Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GLUTEN-5414][VL] FEAT: Support read CSV #5447

Merged
merged 7 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 72 additions & 48 deletions .github/workflows/velox_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -500,12 +500,20 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget numactl-devel -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
Expand All @@ -516,11 +524,6 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/cpp/build && \
ctest -V
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.2.2 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down Expand Up @@ -563,23 +566,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget numactl-devel -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.2.2 (slow tests)
run: |
cd $GITHUB_WORKSPACE// && \
Expand All @@ -599,23 +605,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.3.1 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down Expand Up @@ -654,23 +663,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget numactl-devel -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.3.1 (slow tests)
run: |
cd $GITHUB_WORKSPACE// && \
Expand All @@ -690,23 +702,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.4.2 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down Expand Up @@ -745,23 +760,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.4.2 (slow tests)
run: |
cd $GITHUB_WORKSPACE// && \
Expand All @@ -781,23 +799,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven && \
export PATH=${PATH}:${MAVEN_HOME}/bin && \
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.5.1 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down Expand Up @@ -835,23 +856,26 @@ jobs:
container: ghcr.io/facebookincubator/velox-dev:circleci-avx
steps:
- uses: actions/checkout@v2
- name: Build Gluten velox third party
- name: Setup java and maven
run: |
yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Build Gluten velox third party
run: |
export MAVEN_HOME=/usr/lib/maven
export PATH=${PATH}:${MAVEN_HOME}/bin
cd ep/build-velox/src && \
./get_velox.sh && \
source /opt/rh/gcc-toolset-9/enable && \
./build_arrow_deps_centos8.sh && \
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
- name: Build Gluten CPP library
run: |
cd $GITHUB_WORKSPACE/cpp && \
source /opt/rh/gcc-toolset-9/enable && \
./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON
- name: Setup java and maven
run: |
wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
tar -xvf apache-maven-3.8.8-bin.tar.gz
mv apache-maven-3.8.8 /usr/lib/maven
- name: Prepare spark.test.home for Spark 3.5.1 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down
8 changes: 5 additions & 3 deletions .github/workflows/velox_velox_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@ name: Velox backend Velox Unit test
on:
pull_request:
paths:
- '.github/workflows/velox_velox_ut.yml'
# TODO: wait to fix
# - '.github/workflows/velox_velox_ut.yml'
- 'dev/**'
- 'ep/**' #get_velox change
# - 'ep/**' #get_velox change

concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
Expand Down Expand Up @@ -64,4 +65,5 @@ jobs:

- name: Run Tests
run: |
cd ${GITHUB_WORKSPACE}/ep/build-velox/build/velox_ep/_build/release && ctest -E velox_cache_test -j 4 --output-on-failure --no-tests=error
ccache -c
cd ${GITHUB_WORKSPACE}/ep/build-velox/build/velox_ep/_build/release && ctest -E velox_cache_test -j 4 --output-on-failure --no-tests=error
Original file line number Diff line number Diff line change
Expand Up @@ -617,6 +617,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
override def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] =
List(spark => NativeWritePostRule(spark))

/**
 * Supplies the rule builders returned for `genInjectPostHocResolutionRules`.
 *
 * @return
 *   an empty list; this implementation contributes no rules.
 */
override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] = Nil

/**
* Generate extended Strategies.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -487,6 +487,10 @@ object VeloxBackendSettings extends BackendSettingsApi {
)
}

/**
 * Reports the native Arrow reader switch.
 *
 * @return
 *   the `enableNativeArrowReader` value of the configuration obtained from
 *   `GlutenConfig.getConf`.
 */
override def enableNativeArrowReadFiles(): Boolean =
  GlutenConfig.getConf.enableNativeArrowReader

override def shouldRewriteCount(): Boolean = {
// Velox backend does not support count if it has more than one child,
// so we should rewrite it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package org.apache.gluten.backendsapi.velox

import org.apache.gluten.GlutenConfig
import org.apache.gluten.backendsapi.SparkPlanExecApi
import org.apache.gluten.datasource.ArrowConvertorRule
import org.apache.gluten.exception.GlutenNotSupportException
import org.apache.gluten.execution._
import org.apache.gluten.expression._
Expand Down Expand Up @@ -767,6 +768,10 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi {
SparkShimLoader.getSparkShims.getExtendedColumnarPostRules() ::: List()
}

/**
 * Supplies the rule builders returned for `genInjectPostHocResolutionRules`.
 *
 * @return
 *   a single-element list holding `ArrowConvertorRule`.
 */
override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] =
  List(ArrowConvertorRule)

/**
* Generate extended Strategy.
*
Expand Down
Loading
Loading