Coverage for pyspark/install.py: 93%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

# Licensed to the Apache Software Foundation (ASF) under one or more

# contributor license agreements. See the NOTICE file distributed with

# this work for additional information regarding copyright ownership.

# The ASF licenses this file to You under the Apache License, Version 2.0

# (the "License"); you may not use this file except in compliance with

# the License. You may obtain a copy of the License at

# http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

import os

import re

import tarfile

import traceback

import urllib.request

from shutil import rmtree

# NOTE that we shouldn't import pyspark here because this is used in

# setup.py, and assume there's no PySpark imported.

DEFAULT_HADOOP = "hadoop3.2"

DEFAULT_HIVE = "hive2.3"

SUPPORTED_HADOOP_VERSIONS = ["hadoop2.7", "hadoop3.2", "without-hadoop"]

SUPPORTED_HIVE_VERSIONS = ["hive2.3"]

UNSUPPORTED_COMBINATIONS = [ # type: ignore

]

def checked_package_name(spark_version, hadoop_version, hive_version):

return "%s-bin-%s" % (spark_version, hadoop_version)

def checked_versions(spark_version, hadoop_version, hive_version):

"""

Check the valid combinations of supported versions in Spark distributions.

Parameters

----------

spark_version : str

Spark version. It should be X.X.X such as '3.0.0' or spark-3.0.0.

hadoop_version : str

Hadoop version. It should be X.X such as '2.7' or 'hadoop2.7'.

'without' and 'without-hadoop' are supported as special keywords for Hadoop free

distribution.

hive_version : str

Hive version. It should be X.X such as '2.3' or 'hive2.3'.

Parameters

----------

tuple

fully-qualified versions of Spark, Hadoop and Hive in a tuple.

For example, spark-3.0.0, hadoop3.2 and hive2.3.

"""

if re.match("^[0-9]+\\.[0-9]+\\.[0-9]+$", spark_version):

spark_version = "spark-%s" % spark_version

if not spark_version.startswith("spark-"):

raise RuntimeError(

"Spark version should start with 'spark-' prefix; however, "

"got %s" % spark_version)

if hadoop_version == "without":

hadoop_version = "without-hadoop"

elif re.match("^[0-9]+\\.[0-9]+$", hadoop_version):

hadoop_version = "hadoop%s" % hadoop_version

if hadoop_version not in SUPPORTED_HADOOP_VERSIONS:

raise RuntimeError(

"Spark distribution of %s is not supported. Hadoop version should be "

"one of [%s]" % (hadoop_version, ", ".join(

SUPPORTED_HADOOP_VERSIONS)))

if re.match("^[0-9]+\\.[0-9]+$", hive_version):

hive_version = "hive%s" % hive_version

if hive_version not in SUPPORTED_HIVE_VERSIONS:

raise RuntimeError(

"Spark distribution of %s is not supported. Hive version should be "

"one of [%s]" % (hive_version, ", ".join(

SUPPORTED_HADOOP_VERSIONS)))

return spark_version, hadoop_version, hive_version

def install_spark(dest, spark_version, hadoop_version, hive_version):

"""

Installs Spark that corresponds to the given Hadoop version in the current

library directory.

Parameters

----------

dest : str

The location to download and install the Spark.

spark_version : str

Spark version. It should be spark-X.X.X form.

hadoop_version : str

Hadoop version. It should be hadoopX.X

such as 'hadoop2.7' or 'without-hadoop'.

hive_version : str

Hive version. It should be hiveX.X such as 'hive2.3'.

"""

package_name = checked_package_name(spark_version, hadoop_version, hive_version)

package_local_path = os.path.join(dest, "%s.tgz" % package_name)

109 ↛ 110line 109 didn't jump to line 110, because the condition on line 109 was never true if "PYSPARK_RELEASE_MIRROR" in os.environ:

sites = [os.environ["PYSPARK_RELEASE_MIRROR"]]

else:

sites = get_preferred_mirrors()

print("Trying to download Spark %s from [%s]" % (spark_version, ", ".join(sites)))

pretty_pkg_name = "%s for Hadoop %s" % (

spark_version,

"Free build" if hadoop_version == "without" else hadoop_version)

119 ↛ 146line 119 didn't jump to line 146, because the loop on line 119 didn't complete for site in sites:

os.makedirs(dest, exist_ok=True)

url = "%s/spark/%s/%s.tgz" % (site, spark_version, package_name)

tar = None

try:

print("Downloading %s from:\n- %s" % (pretty_pkg_name, url))

download_to_file(urllib.request.urlopen(url), package_local_path)

print("Installing to %s" % dest)

tar = tarfile.open(package_local_path, "r:gz")

for member in tar.getmembers():

if member.name == package_name:

# Skip the root directory.

continue

member.name = os.path.relpath(member.name, package_name + os.path.sep)

tar.extract(member, dest)

return

except Exception:

print("Failed to download %s from %s:" % (pretty_pkg_name, url))

traceback.print_exc()

rmtree(dest, ignore_errors=True)

finally:

if tar is not None:

tar.close()

144 ↛ exitline 144 didn't return from function 'install_spark', because the return on line 136 wasn't executed if os.path.exists(package_local_path):

145 ↛ 119line 145 didn't jump to line 119 os.remove(package_local_path)

raise IOError("Unable to download %s." % pretty_pkg_name)

def get_preferred_mirrors():

mirror_urls = []

for _ in range(3):

try:

response = urllib.request.urlopen(

"https://www.apache.org/dyn/closer.lua?preferred=true")

mirror_urls.append(response.read().decode('utf-8'))

except Exception:

# If we can't get a mirror URL, skip it. No retry.

pass

default_sites = [

"https://archive.apache.org/dist", "https://dist.apache.org/repos/dist/release"]

return list(set(mirror_urls)) + default_sites

def download_to_file(response, path, chunk_size=1024 * 1024):

total_size = int(response.info().get('Content-Length').strip())

bytes_so_far = 0

with open(path, mode="wb") as dest:

while True:

chunk = response.read(chunk_size)

bytes_so_far += len(chunk)

if not chunk:

break

dest.write(chunk)

print("Downloaded %d of %d bytes (%0.2f%%)" % (

bytes_so_far,

total_size,

round(float(bytes_so_far) / total_size * 100, 2)))

Coverage for pyspark/install.py : 93%

82 statements 78 run 4 missing 0 excluded 4 partial