[hailctl] update to dataproc 2.2 and Spark 3.5.0 (#14158)
Fixes #13971

CHANGELOG: Hail now supports and primarily tests against Dataproc 2.2.5,
Spark 3.5.0, and Java 11. We strongly recommend updating to Spark 3.5.0
and Java 11. You should also update your GCS connector *after installing
Hail*: `curl https://broad.io/install-gcs-connector | python3`. Do not
try to update before installing Hail 0.2.131.


https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.2
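
The changelog's ordering constraint, sketched as a script. This is a hedged sketch, not part of the commit: it assumes `python3 -m pip` installs into the environment where Hail runs, and the connector-install command is the one quoted above.

```python
# Minimal sketch of the CHANGELOG's upgrade order: install Hail first,
# then refresh the GCS connector.
import subprocess

# 1. Hail 0.2.131 pins pyspark==3.5.0, so this also upgrades Spark.
subprocess.run(['python3', '-m', 'pip', 'install', '--upgrade', 'hail'], check=True)

# 2. Only after Hail is installed, update the GCS connector.
subprocess.run('curl https://broad.io/install-gcs-connector | python3', shell=True, check=True)
```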

---------

Co-authored-by: Edmund Higham <[email protected]>
danking and ehigham authored Apr 11, 2024
1 parent dcb83f6 commit bd0156d
Showing 29 changed files with 80 additions and 108 deletions.
4 changes: 2 additions & 2 deletions batch/Dockerfile.worker
@@ -6,7 +6,7 @@ RUN hail-apt-get-install \
iproute2 \
iptables \
ca-certificates-java \
-openjdk-8-jre-headless \
+openjdk-11-jre-headless \
liblapack3 \
xfsprogs \
libyajl-dev
@@ -51,7 +51,7 @@ RUN hail-pip-install \
-r hailtop-requirements.txt \
-r gear-requirements.txt \
-r batch-requirements.txt \
-pyspark==3.3.2
+pyspark==3.5.0

ENV SPARK_HOME /usr/local/lib/python3.9/dist-packages/pyspark
ENV PATH "$PATH:$SPARK_HOME/sbin:$SPARK_HOME/bin"
2 changes: 1 addition & 1 deletion batch/jvm-entryway/build.gradle
@@ -17,7 +17,7 @@ repositories {
}

project.ext {
-sparkVersion = System.getProperty("spark.version", "3.3.2")
+sparkVersion = System.getProperty("spark.version", "3.5.0")
scalaVersion = System.getProperty("scala.version", "2.12.18")
}

2 changes: 1 addition & 1 deletion docker/Dockerfile.base
@@ -8,7 +8,7 @@ RUN hail-apt-get-install \
rsync \
emacs-nox \
xsltproc pandoc \
-openjdk-8-jdk-headless \
+openjdk-11-jdk-headless \
liblapack3 \
liblz4-dev \
g++-10 \
8 changes: 4 additions & 4 deletions docker/core-site.xml
@@ -4,13 +4,13 @@
<configuration>

<property>
-<name>google.cloud.auth.service.account.enable</name>
-<value>true</value>
+<name>google.cloud.auth.type</name>
+<value>SERVICE_ACCOUNT_JSON_KEYFILE</value>
</property>

<property>
<name>google.cloud.auth.service.account.json.keyfile</name>
<value>/gsa-key/key.json</value>
</property>

</configuration>
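
The old boolean `google.cloud.auth.service.account.enable` is replaced by the GCS connector 3.0 `google.cloud.auth.type` setting. For Spark sessions configured in code rather than via `core-site.xml`, the same properties can be forwarded with Spark's `spark.hadoop.*` prefix; a hedged sketch, not part of this commit:

```python
# Hypothetical programmatic equivalent of the core-site.xml above.
# Assumes the gcs-connector 3.0.0 shaded jar is on the Spark classpath.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    # spark.hadoop.<key> properties are copied into the Hadoop configuration.
    .config('spark.hadoop.google.cloud.auth.type', 'SERVICE_ACCOUNT_JSON_KEYFILE')
    .config('spark.hadoop.google.cloud.auth.service.account.json.keyfile', '/gsa-key/key.json')
    .getOrCreate()
)
```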
7 changes: 4 additions & 3 deletions docker/hailgenetics/hail/Dockerfile
@@ -7,14 +7,15 @@ FROM $BASE_IMAGE
RUN hail-apt-get-install \
git \
liblapack3 \
-openjdk-8-jre-headless
+openjdk-11-jre-headless

COPY hail/python/pinned-requirements.txt requirements.txt
RUN hail-pip-install -r requirements.txt scikit-learn ipython

RUN export SPARK_HOME=$(find_spark_home.py) && \
-curl https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-2.2.7.jar \
->$SPARK_HOME/jars/gcs-connector-hadoop2-2.2.7.jar && \
+curl --fail --silent --show-error --location \
+https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/3.0.0/gcs-connector-3.0.0-shaded.jar \
+>${SPARK_HOME}/jars/gcs-connector-3.0.0-shaded.jar && \
mkdir -p $SPARK_HOME/conf && \
touch $SPARK_HOME/conf/spark-defaults.conf && \
sed -i $SPARK_HOME/conf/spark-defaults.conf \
4 changes: 3 additions & 1 deletion hail/Dockerfile.hail-run
@@ -18,5 +18,7 @@ ENV PATH "$PATH:$SPARK_HOME/sbin:$SPARK_HOME/bin"
ENV PYSPARK_PYTHON python3
ENV PYSPARK_SUBMIT_ARGS "--conf spark.kryo.registrator=is.hail.kryo.HailKryoRegistrator pyspark-shell"

-RUN curl >${SPARK_HOME}/jars/gcs-connector-hadoop2-2.2.7.jar https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-2.2.7.jar
+RUN curl --fail --silent --show-error --location \
+    https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/3.0.0/gcs-connector-3.0.0-shaded.jar \
+    >${SPARK_HOME}/jars/gcs-connector-3.0.0-shaded.jar
COPY docker/core-site.xml ${SPARK_HOME}/conf/core-site.xml
1 change: 0 additions & 1 deletion hail/Makefile
@@ -366,7 +366,6 @@ install-editable: $(FAST_PYTHON_JAR) $(FAST_PYTHON_JAR_EXTRA_CLASSPATH)
.PHONY: install-for-qob
install-for-qob: upload-qob-jar install-editable
! [ -z $(NAMESPACE) ] # call this like: make install-for-qob NAMESPACE=default
-$(JAVAC) -version 2>&1 | grep -e '1\.8\.0' # install-for-qob requires javac version 1.8, see https://discuss.hail.is/t/on-mac-os-x-how-do-i-install-and-use-java-8-if-i-already-have-a-different-version-of-java-installed/831/2
hailctl config set query/backend batch
hailctl config set query/jar_url $$(cat upload-qob-jar)
hailctl dev config set default_namespace $(NAMESPACE)
10 changes: 7 additions & 3 deletions hail/build.sc
@@ -38,7 +38,7 @@ def javaVersion: T[String] = T.input {
}

def sparkVersion: T[String] = T.input {
-Result.Success(T.ctx().env.getOrElse("SPARK_VERSION", "3.3.0"))
+Result.Success(T.ctx().env.getOrElse("SPARK_VERSION", "3.5.0"))
}

def debugMode: T[Boolean] = T.input {
@@ -79,6 +79,10 @@ object Deps {
}

object Breeze {
+// WARNING WARNING WARNING
+// Before changing the breeze version review:
+// - https://hail.zulipchat.com/#narrow/stream/123011-Hail-Query-Dev/topic/new.20spark.20ndarray.20failures/near/41645
+// - https://github.com/hail-is/hail/pull/11555
val core = ivy"org.scalanlp::breeze:1.1"
val natives = ivy"org.scalanlp::breeze-natives:1.1"
}
@@ -210,8 +214,8 @@ object main extends RootModule with HailScalaModule { outer =>
override def compileIvyDeps: T[Agg[Dep]] = Agg(
Deps.log4j,
Deps.hadoopClient,
-Deps.Spark.core(),
-Deps.Spark.mllib(),
+Deps.Spark.core().excludeOrg("org.scalanlp"), // Hail has an explicit dependency on Breeze 1.1
+Deps.Spark.mllib().excludeOrg("org.scalanlp"), // Hail has an explicit dependency on Breeze 1.1
Deps.Breeze.core,
)

6 changes: 3 additions & 3 deletions hail/python/dataproc-pre-installed-requirements.txt
@@ -1,4 +1,4 @@
-# https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.1
+# https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.2
#
-# 2.1.33-debian11
-pyspark==3.3.2
+# 2.2.5-debian12
+pyspark==3.5.0
2 changes: 1 addition & 1 deletion hail/python/dev/requirements.txt
@@ -14,7 +14,7 @@ pytest-instafail>=0.4.2,<1
pytest-asyncio>=0.14.0,<0.23
pytest-timestamper>=0.0.9,<1
pytest-timeout>=2.1,<3
-pyright>=1.1.324<1.2
+pyright>=1.1.349<1.2
sphinx>=6,<7
sphinx-autodoc-typehints==1.23.0
nbsphinx>=0.8.8,<1
12 changes: 11 additions & 1 deletion hail/python/hail/backend/local_backend.py
@@ -1,3 +1,5 @@
+import glob
+import logging
import os
import sys
from contextlib import ExitStack
@@ -17,6 +19,8 @@
from .backend import local_jar_information
from .py4j_backend import Py4JBackend, uninstall_exception_handler

+log = logging.getLogger('hail.backend')


class LocalBackend(Py4JBackend):
def __init__(
@@ -49,12 +53,18 @@ def __init__(
if jvm_heap_size is not None:
jvm_opts.append(f'-Xmx{jvm_heap_size}')

+py4j_jars = glob.glob(f'{spark_home}/jars/py4j-*.jar')
+if len(py4j_jars) == 0:
+    raise ValueError(f'No py4j JAR found in {spark_home}/jars')
+if len(py4j_jars) > 1:
+    log.warning(f'found multiple py4j jars; arbitrarily choosing the first one: {py4j_jars}')

port = launch_gateway(
redirect_stdout=sys.stdout,
redirect_stderr=sys.stderr,
java_path=None,
javaopts=jvm_opts,
-jarpath=f'{spark_home}/jars/py4j-0.10.9.5.jar',
+jarpath=py4j_jars[0],
classpath=extra_classpath,
die_on_exit=True,
)
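
The hard-coded jar name broke because Spark 3.5.0 bundles py4j 0.10.9.7 rather than 0.10.9.5 (see the pinned-requirements change below). A self-contained sketch of the discovery pattern, assuming only that `SPARK_HOME` is set:

```python
# Standalone sketch of the glob-based py4j discovery introduced above.
import glob
import os

spark_home = os.environ['SPARK_HOME']
py4j_jars = glob.glob(f'{spark_home}/jars/py4j-*.jar')  # py4j-0.10.9.7.jar under Spark 3.5.0
if not py4j_jars:
    raise ValueError(f'No py4j JAR found in {spark_home}/jars')
jarpath = py4j_jars[0]  # normally exactly one; otherwise warn and take the first
```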
7 changes: 2 additions & 5 deletions hail/python/hail/docs/getting_started_developing.rst
@@ -6,11 +6,8 @@ Hail is an open-source project. We welcome contributions to the repository.
Requirements
~~~~~~~~~~~~

-- `Java 8 or 11 JDK <https://adoptopenjdk.net/index.html>`_ . If you have a Mac, you must use a
-  compatible architecture (``uname -m`` prints your architecture). Moreover, you *must* use Java
-  **8** or **11**. Hail does not support other versions because `Spark does not support other
-  versions
-  <https://spark.apache.org/docs/3.4.0/#:%7E:text=Spark%20runs%20on%20Java%208,3.6%2B%20and%20R%203.5%2B.>`__.
+- `Java 11 JDK <https://adoptopenjdk.net/index.html>`_ . If you have a Mac, you must use a
+  compatible architecture (``uname -m`` prints your architecture).

- The Python and non-pip installation requirements in `Getting Started <getting_started.html>`_.
Note: These instructions install the JRE but that is not necessary as the JDK should already
4 changes: 2 additions & 2 deletions hail/python/hail/docs/install/linux.rst
@@ -2,7 +2,7 @@
Install Hail on GNU/Linux
=========================

-- Install Java 8 or Java 11.
+- Install Java 11.
- Install Python 3.9 or later.
- Install a recent version of the C and C++ standard libraries. GCC 5.0, LLVM
version 3.4, or any later versions suffice.
@@ -14,7 +14,7 @@ On a recent Debian-like system, the following should suffice:
.. code-block:: sh
apt-get install -y \
-openjdk-8-jre-headless \
+openjdk-11-jre-headless \
g++ \
python3.9 python3-pip \
libopenblas-base liblapack3
4 changes: 2 additions & 2 deletions hail/python/hail/docs/install/macosx.rst
@@ -2,8 +2,8 @@
Install Hail on Mac OS X
========================

-- Install Java 8 or 11. We recommend using a `packaged installation from Azul
-  <https://www.azul.com/downloads/?version=java-8-lts&os=macos&package=jdk&show-old-builds=true>`__
+- Install Java 11. We recommend using a `packaged installation from Azul
+  <https://www.azul.com/downloads/?version=java-11-lts&os=macos&package=jdk&show-old-builds=true>`__
(make sure the OS version and architecture match your system) or using `Homebrew
<https://brew.sh/>`__:

8 changes: 4 additions & 4 deletions hail/python/hail/docs/install/other-cluster.rst
@@ -5,12 +5,12 @@ Install Hail on a Spark Cluster
If you are using Google Dataproc, please see `these simpler instructions <dataproc.rst>`__. If you
are using Azure HDInsight please see `these simpler instructions <azure.rst>`__.

-Hail should work with any Spark 3.3.x cluster built with Scala 2.12.
+Hail should work with any Spark 3.5.x cluster built with Scala 2.12.

Hail needs to be built from source on the leader node. Building Hail from source
requires:

-- Java 8 or 11 JDK.
+- Java 11 JDK.
- Python 3.9 or later.
- A recent C and a C++ compiler, GCC 5.0, LLVM 3.4, or later versions of either
suffice.
@@ -23,7 +23,7 @@ On a Debian-like system, the following should suffice:
apt-get update
apt-get install \
-openjdk-8-jdk-headless \
+openjdk-11-jdk-headless \
g++ \
python3 python3-pip \
libopenblas-dev liblapack-dev \
@@ -36,7 +36,7 @@ The next block of commands downloads, builds, and installs Hail from source.
git clone https://github.com/hail-is/hail.git
cd hail/hail
-make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.18 SPARK_VERSION=3.3.2
+make install-on-cluster HAIL_COMPILE_NATIVES=1 SCALA_VERSION=2.12.18 SPARK_VERSION=3.5.0
If you forget to install any of the requirements before running `make install-on-cluster`, it's possible
to get into a bad state where `make` insists you don't have a requirement that you have in fact installed.
6 changes: 3 additions & 3 deletions hail/python/hail/expr/expressions/typed_expressions.py
@@ -2944,7 +2944,7 @@ def replace(self, pattern1, pattern2):
-----
The regex expressions used should follow `Java regex syntax
-<https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_. In
+<https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/regex/Pattern.html>`_. In
the Java regular expression syntax, a dollar sign, ``$1``, refers to the
first group, not the canonical ``\\1``.
@@ -2974,7 +2974,7 @@ def split(self, delim, n=None):
Notes
-----
The delimiter is a regex using the
-`Java regex syntax <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`_
+`Java regex syntax <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/regex/Pattern.html>`_
delimiter. To split on special characters, escape them with double
backslash (``\\\\``).
@@ -3222,7 +3222,7 @@ def matches(self, regex, full_match=False):
The `regex` argument is a
`regular expression <https://en.wikipedia.org/wiki/Regular_expression>`__,
and uses
-`Java regex syntax <https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`__.
+`Java regex syntax <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/regex/Pattern.html>`__.
Parameters
----------
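
A hedged example of the `$1` group-reference semantics the `replace` docstring describes (assumes an initialized Hail session):

```python
# '$1' is a Java-style reference to the first capture group; Python's re
# module would spell the same reference r'\1'.
import hail as hl

hl.eval(hl.str('hello').replace('(h)ello', '$1i'))  # returns 'hi'
```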
4 changes: 2 additions & 2 deletions hail/python/hail/expr/functions.py
@@ -6581,7 +6581,7 @@ def format(f, *args):
Notes
-----
-See the `Java documentation <https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#format-java.lang.String-java.lang.Object...->`__
+See the `Java documentation <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Formatter.html#syntax>`__
for valid format specifiers and arguments.
Missing values are printed as ``'null'`` except when using the
@@ -6590,7 +6590,7 @@
Parameters
----------
f : :class:`.StringExpression`
-Java `format string <https://docs.oracle.com/javase/8/docs/api/java/util/Formatter.html#syntax>`__.
+Java `format string <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/Formatter.html#syntax>`__.
args : variable-length arguments of :class:`.Expression`
Arguments to format.
8 changes: 4 additions & 4 deletions hail/python/hail/methods/impex.py
@@ -1033,7 +1033,7 @@ def grep(regex, path, max_count=100, *, show: bool = True, force: bool = False,
convenience to those in the statistical genetics community who often
search enormous text files like VCFs. Hail uses `Java regular expression
patterns
-<https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html>`__.
+<https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/util/regex/Pattern.html>`__.
The `RegExr sandbox <http://regexr.com/>`__ may be helpful.
Parameters
@@ -1747,7 +1747,7 @@ def import_table(
find_replace : (:class:`str`, :obj:`str`)
Line substitution regex. Functions like ``re.sub``, but obeys the exact
semantics of Java's
-`String.replaceAll <https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#replaceAll-java.lang.String-java.lang.String->`__.
+`String.replaceAll <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/lang/String.html#replaceAll(java.lang.String,java.lang.String)>`__.
force : :obj:`bool`
If ``True``, load gzipped files serially on one core. This should
be used only when absolutely necessary, as processing time will be
@@ -3030,7 +3030,7 @@ def import_vcf(
find_replace : (:class:`str`, :obj:`str`)
Line substitution regex. Functions like ``re.sub``, but obeys the exact
semantics of Java's
-`String.replaceAll <https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#replaceAll-java.lang.String-java.lang.String->`__.
+`String.replaceAll <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/lang/String.html#replaceAll(java.lang.String,java.lang.String)>`__.
n_partitions : :obj:`int`, optional
Number of partitions. If both `n_partitions` and `block_size`
are specified, `n_partitions` will be used.
@@ -3517,7 +3517,7 @@ def import_csv(
find_replace : (:class:`str`, :obj:`str`)
Line substitution regex. Functions like ``re.sub``, but obeys the exact
semantics of Java's
-`String.replaceAll <https://docs.oracle.com/javase/8/docs/api/java/lang/String.html#replaceAll-java.lang.String-java.lang.String->`__.
+`String.replaceAll <https://docs.oracle.com/en/java/javase/11/docs/api/java.base/java/lang/String.html#replaceAll(java.lang.String,java.lang.String)>`__.
force : :obj:`bool`
If ``True``, load gzipped files serially on one core. This should
be used only when absolutely necessary, as processing time will be
2 changes: 1 addition & 1 deletion hail/python/hailtop/hailctl/dataproc/start.py
@@ -129,7 +129,7 @@ class VepVersion(str, Enum):

ANNOTATION_DB_BUCKETS = ["hail-datasets-us-central1", "hail-datasets-europe-west1"]

-IMAGE_VERSION = '2.1.33-debian11'
+IMAGE_VERSION = '2.2.5-debian12'


def start(
9 changes: 7 additions & 2 deletions hail/python/pinned-requirements.txt
@@ -211,7 +211,12 @@ portalocker==2.8.2
# via
# -c hail/hail/python/hailtop/pinned-requirements.txt
# msal-extensions
-py4j==0.10.9.5
+protobuf==3.20.2
+# via
+#   -c hail/hail/python/hailtop/pinned-requirements.txt
+#   -r hail/hail/python/hailtop/requirements.txt
+#   -r hail/hail/python/requirements.txt
+py4j==0.10.9.7
# via pyspark
pyasn1==0.5.1
# via
@@ -238,7 +243,7 @@ pyjwt[crypto]==2.8.0
# via
# -c hail/hail/python/hailtop/pinned-requirements.txt
# msal
-pyspark==3.3.2
+pyspark==3.5.0
# via
# -c hail/hail/python/dataproc-pre-installed-requirements.txt
# -r hail/hail/python/requirements.txt
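
The py4j pin moves in lockstep with pyspark because every Spark release bundles one specific py4j. A hedged post-install consistency check, assuming both packages are importable:

```python
# Sanity check that the pins above agree with each other:
# the pyspark 3.5.0 wheel ships py4j 0.10.9.7.
import py4j
import pyspark

assert pyspark.__version__ == '3.5.0'
assert py4j.__version__ == '0.10.9.7'
```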
3 changes: 2 additions & 1 deletion hail/python/requirements.txt
@@ -10,6 +10,7 @@ numpy<2
pandas>=2,<3
parsimonious<1
plotly>=5.18.0,<6
-pyspark>=3.3.2,<3.4
+protobuf==3.20.2
+pyspark>=3.5.0,<3.6
requests>=2.31.0,<3
scipy>1.2,<1.12
@@ -11,10 +11,12 @@
@qobtest
def test_exceptions_from_workers_have_stack_traces():
ht = hl.utils.range_table(10, n_partitions=10)
-ht = ht.annotate(x=hl.int(1) // hl.int(hl.rand_norm(0, 0.1)))
+ht = ht.annotate(x=hl.int(1) // hl.int(hl.rand_unif(0, 1)))
pattern = (
'.*'
-  + re.escape('java.lang.Math.floorDiv(Math.java:1052)')
+  + re.escape('java.lang.Math.floorDiv(Math.java:')
+  + '[0-9]+'
+  + re.escape(')')
+ '.*'
+ re.escape('(BackendUtils.scala:')
+ '[0-9]+'
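
The assertion is loosened because the source line of `Math.floorDiv` differs between JDKs (1052 is the old Java 8 location). A small check that the relaxed pattern accepts any line number (the second number below is made up for illustration):

```python
# The relaxed pattern matches a floorDiv frame regardless of JDK line number.
import re

pattern = '.*' + re.escape('java.lang.Math.floorDiv(Math.java:') + '[0-9]+' + re.escape(')') + '.*'

assert re.match(pattern, 'at java.lang.Math.floorDiv(Math.java:1052)')  # Java 8's line number
assert re.match(pattern, 'at java.lang.Math.floorDiv(Math.java:1234)')  # hypothetical newer JDK
```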