From ed2d0286f2e3102d507aed87ed8ecbc4e2a642cf Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Tue, 27 Aug 2024 14:51:02 +0900 Subject: [PATCH] [SPARK-49402][PYTHON] Fix Binder integration in PySpark documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR proposes to fix Binder integration by using `Dockerfile` directly. Binder integration is broken now (https://mybinder.org/v2/gh/apache/spark/bb7846dd487?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb): ![Screenshot 2024-08-27 at 2 04 35 PM](https://github.com/user-attachments/assets/29222fc2-7cc6-43fa-8e04-a65c8384c4d5) This seems to be related to the size of the repository (https://github.com/jupyterhub/mybinder.org-deploy/issues/3074). I tried all the ways out but could not find the way except using `Dockerfile`. Yes. This should recover the Binder integration. Manually tested within my fork: https://mybinder.org/v2/gh/HyukjinKwon/spark/binder-test1?filepath=python%2Fdocs%2Fsource%2Fgetting_started%2Fquickstart_df.ipynb No. Closes #47883 from HyukjinKwon/binder-test1. Authored-by: Hyukjin Kwon Signed-off-by: Hyukjin Kwon (cherry picked from commit 9fc1e05224d2b4207885a1d4f822fed6db0cb1a1) Signed-off-by: Hyukjin Kwon --- binder/Dockerfile | 43 +++++++++++++++++++++++++++++++++++++++++++ binder/apt.txt | 2 -- binder/postBuild | 2 +- 3 files changed, 44 insertions(+), 3 deletions(-) create mode 100644 binder/Dockerfile delete mode 100644 binder/apt.txt mode change 100644 => 100755 binder/postBuild diff --git a/binder/Dockerfile b/binder/Dockerfile new file mode 100644 index 0000000000000..a0af7312ef4ff --- /dev/null +++ b/binder/Dockerfile @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+FROM python:3.10-slim
+# Install the Jupyter notebook and JupyterLab packages; pip's cache is disabled so it is not stored in the layer.
+RUN pip install --no-cache-dir notebook jupyterlab
+
+# Create a user with a home directory; NB_USER/NB_UID may be overridden with --build-arg, the defaults keep a plain build working.
+ARG NB_USER=jovyan
+ARG NB_UID=1000
+ENV USER=${NB_USER}
+ENV HOME=/home/${NB_USER}
+
+RUN adduser --disabled-password \
+    --gecos "Default user" \
+    --uid ${NB_UID} \
+    ${NB_USER}
+WORKDIR ${HOME}
+USER ${USER}
+
+# Copy the repo into ${HOME}, then as root hand it to the notebook user and install the packages formerly listed in binder/apt.txt (plus coreutils).
+COPY . ${HOME}
+USER root
+RUN chown -R ${NB_UID} ${HOME}
+RUN apt-get update && apt-get install -y openjdk-8-jre git coreutils && rm -rf /var/lib/apt/lists/*
+USER ${NB_USER}
+# Run the repository's post-build script as the notebook user, from ${HOME}.
+RUN binder/postBuild
+
diff --git a/binder/apt.txt b/binder/apt.txt
deleted file mode 100644
index 3d86667d4b910..0000000000000
--- a/binder/apt.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-openjdk-8-jre
-git
diff --git a/binder/postBuild b/binder/postBuild
old mode 100644
new mode 100755
index 70ae23b393707..9cf90a1038a62
--- a/binder/postBuild
+++ b/binder/postBuild
@@ -21,7 +21,7 @@
 # Jupyter notebook.
 
 VERSION=$(python -c "exec(open('python/pyspark/version.py').read()); print(__version__)")
-TAG=$(git describe --tags --exact-match 2>/dev/null)
+TAG=$(git describe --tags --exact-match 2> /dev/null || true)
 
 # If a commit is tagged, exactly specified version of pyspark should be installed to avoid
 # a kind of accident that an old version of pyspark is installed in the live notebook environment.