
Commit ae1d39b

Merge remote-tracking branch 'apache-github/master' into graceful-shutdown

Conflicts:
	streaming/src/main/scala/org/apache/spark/streaming/StreamingContext.scala
	streaming/src/main/scala/org/apache/spark/streaming/api/java/JavaStreamingContext.scala
	streaming/src/main/scala/org/apache/spark/streaming/dstream/NetworkInputDStream.scala
	streaming/src/main/scala/org/apache/spark/streaming/scheduler/NetworkInputTracker.scala
tdas committed Apr 8, 2014
2 parents 6b59cfc + 0307db0 commit ae1d39b
Showing 449 changed files with 13,295 additions and 2,594 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -47,3 +47,4 @@ spark-*-bin.tar.gz
unit-tests.log
/lib/
rat-results.txt
scalastyle.txt
1 change: 1 addition & 0 deletions .rat-excludes
@@ -39,3 +39,4 @@ work
.*\.q
golden
test.out/*
.*iml
11 changes: 3 additions & 8 deletions .travis.yml
@@ -20,18 +20,13 @@
- oraclejdk7
env:
matrix:
- TEST=sql/test
- TEST="scalastyle assembly/assembly"
- TEST="catalyst/test sql/test streaming/test mllib/test graphx/test bagel/test"
- TEST=hive/test
- TEST=catalyst/test
- TEST=streaming/test
- TEST=graphx/test
- TEST=mllib/test
- TEST=graphx/test
- TEST=bagel/test
cache:
directories:
- $HOME/.m2
- $HOME/.ivy2
- $HOME/.sbt
script:
- "sbt ++$TRAVIS_SCALA_VERSION scalastyle $TEST"
- "sbt ++$TRAVIS_SCALA_VERSION $TEST"
12 changes: 11 additions & 1 deletion assembly/pom.xml
@@ -163,6 +163,16 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>hive</id>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>spark-ganglia-lgpl</id>
<dependencies>
@@ -208,7 +218,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>buildnumber-maven-plugin</artifactId>
<version>1.1</version>
<version>1.2</version>
<executions>
<execution>
<phase>validate</phase>
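
As context for the new profile, a hedged sketch of how it could be activated at build time; -Phive selects the profile whose <id> is hive, and the remaining flags are illustrative rather than taken from this commit:

# Hypothetical Maven invocation: -Phive enables the hive profile added above,
# so spark-hive_${scala.binary.version} is pulled into the assembly.
mvn -Phive -DskipTests clean package
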
36 changes: 19 additions & 17 deletions bin/compute-classpath.sh
@@ -30,22 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
# the future.
if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
echo "Hive assembly found, including hive support. If this isn't desired run sbt hive/clean."

# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS

ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
else
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
fi
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
@@ -60,7 +45,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
@@ -72,6 +57,23 @@ else
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi

# When Hive support is needed, Datanucleus jars must be included on the classpath.
# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
if [ $num_datanucleus_jars -gt 0 ]; then
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
if [ $num_hive_files -gt 0 ]; then
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
fi
fi

# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
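
Two shell idioms carry the new classpath block above; a minimal standalone sketch with hypothetical paths:

# Join a glob of jars into a ':'-separated classpath fragment, as done for DATANUCLEUSJARS
# (this assumes the jar file names contain no spaces).
DATANUCLEUSJARS=$(echo /opt/spark/lib_managed/jars/datanucleus-*.jar | tr " " :)

# Decide whether an assembly jar was built with Hive by listing a Hive package path inside it;
# a non-zero line count means the classes are present.
if [ "$(jar tvf /opt/spark/assembly.jar org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)" -gt 0 ]; then
  echo "assembly includes Hive classes"
fi
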
3 changes: 3 additions & 0 deletions bin/load-spark-env.sh
@@ -30,6 +30,9 @@ if [ -z "$SPARK_ENV_LOADED" ]; then
use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}

if [ -f "${use_conf_dir}/spark-env.sh" ]; then
# Promote all variable declarations to environment (exported) variables
set -a
. "${use_conf_dir}/spark-env.sh"
set +a
fi
fi
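
The added set -a / set +a pair makes every assignment in spark-env.sh an exported environment variable, even without an explicit export keyword. A small sketch with a hypothetical file:

# Variables defined without "export"...
echo 'MY_SETTING=42' > /tmp/example-env.sh

set -a                    # auto-export all subsequent assignments
. /tmp/example-env.sh
set +a                    # restore normal behavior

env | grep MY_SETTING     # prints MY_SETTING=42 because the assignment was exported
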
3 changes: 2 additions & 1 deletion bin/pyspark
@@ -55,7 +55,8 @@ if [ -n "$IPYTHON_OPTS" ]; then
IPYTHON=1
fi

if [[ "$IPYTHON" = "1" ]] ; then
# Only use ipython if no command line arguments were provided [SPARK-1134]
if [[ "$IPYTHON" = "1" && $# = 0 ]] ; then
exec ipython $IPYTHON_OPTS
else
exec "$PYSPARK_PYTHON" "$@"
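
In effect, IPython is now used only for interactive sessions; a hedged illustration (script name hypothetical):

IPYTHON=1 ./bin/pyspark                 # no arguments: launches IPython with $IPYTHON_OPTS
IPYTHON=1 ./bin/pyspark my_script.py    # arguments present: runs $PYSPARK_PYTHON my_script.py instead
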
2 changes: 0 additions & 2 deletions bin/spark-class
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
fi

exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"


226 changes: 168 additions & 58 deletions bin/spark-shell
@@ -30,67 +30,189 @@ esac
# Enter posix mode for bash
set -o posix

CORE_PATTERN="^[0-9]+$"
MEM_PATTERN="^[0-9]+[m|g|M|G]$"

## Global script variables
FWDIR="$(cd `dirname $0`/..; pwd)"

if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then
echo "Usage: spark-shell [OPTIONS]"
echo "OPTIONS:"
echo "-c --cores num, the maximum number of cores to be used by the spark shell"
echo "-em --execmem num[m|g], the memory used by each executor of spark shell"
echo "-dm --drivermem num[m|g], the memory used by the spark shell and driver"
echo "-h --help, print this help information"
exit
fi
SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
DEFAULT_MASTER="local[*]"
MASTER=${MASTER:-""}

info_log=0

#CLI Color Templates
txtund=$(tput sgr 0 1) # Underline
txtbld=$(tput bold) # Bold
bldred=${txtbld}$(tput setaf 1) # red
bldyel=${txtbld}$(tput setaf 3) # yellow
bldblu=${txtbld}$(tput setaf 4) # blue
bldwht=${txtbld}$(tput setaf 7) # white
txtrst=$(tput sgr0) # Reset
info=${bldwht}*${txtrst} # Feedback
pass=${bldblu}*${txtrst}
warn=${bldred}*${txtrst}
ques=${bldblu}?${txtrst}

# Helper function to describe the script usage
function usage() {
cat << EOF
${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
${txtbld}OPTIONS${txtrst}:
-h --help : Print this help information.
-c --cores : The maximum number of cores to be used by the Spark Shell.
-em --executor-memory : The memory used by each executor of the Spark Shell, the number
is followed by m for megabytes or g for gigabytes, e.g. "1g".
-dm --driver-memory : The memory used by the Spark Shell, the number is followed
by m for megabytes or g for gigabytes, e.g. "1g".
-m --master : A full string that describes the Spark Master, defaults to "local[*]"
e.g. "spark://localhost:7077".
--log-conf : Enables logging of the supplied SparkConf as INFO at start of the
Spark Context.
e.g.
spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g
EOF
}

function out_error(){
echo -e "${txtund}${bldred}ERROR${txtrst}: $1"
usage
exit 1
}

function log_info(){
[ $info_log -eq 1 ] && echo -e "${bldyel}INFO${txtrst}: $1"
}

function log_warn(){
echo -e "${txtund}${bldyel}WARN${txtrst}: $1"
}

for o in "$@"; do
if [ "$1" = "-c" -o "$1" = "--cores" ]; then
shift
# PATTERNS used to validate more than one optional arg.
ARG_FLAG_PATTERN="^-"
MEM_PATTERN="^[0-9]+[m|g|M|G]$"
NUM_PATTERN="^[0-9]+$"
PORT_PATTERN="^[0-9]+$"

# Setters for optional args.
function set_cores(){
CORE_PATTERN="^[0-9]+$"
if [[ "$1" =~ $CORE_PATTERN ]]; then
SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
shift
SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
else
echo "ERROR: wrong format for -c/--cores"
exit 1
out_error "wrong format for $2"
fi
fi
if [ "$1" = "-em" -o "$1" = "--execmem" ]; then
shift
}

function set_em(){
if [[ $1 =~ $MEM_PATTERN ]]; then
SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.executor.memory=$1"
shift
else
echo "ERROR: wrong format for --execmem/-em"
exit 1
out_error "wrong format for $2"
fi
fi
if [ "$1" = "-dm" -o "$1" = "--drivermem" ]; then
shift
}

function set_dm(){
if [[ $1 =~ $MEM_PATTERN ]]; then
export SPARK_DRIVER_MEMORY=$1
shift
else
echo "ERROR: wrong format for --drivermem/-dm"
exit 1
out_error "wrong format for $2"
fi
fi
done
}

function set_spark_log_conf(){
SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.logConf=$1"
}

# Set MASTER from spark-env if possible
DEFAULT_SPARK_MASTER_PORT=7077
if [ -z "$MASTER" ]; then
. $FWDIR/bin/load-spark-env.sh
if [ "x" != "x$SPARK_MASTER_IP" ]; then
if [ "y" != "y$SPARK_MASTER_PORT" ]; then
SPARK_MASTER_PORT="${SPARK_MASTER_PORT}"
function set_spark_master(){
if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
export MASTER="$1"
else
SPARK_MASTER_PORT=$DEFAULT_SPARK_MASTER_PORT
out_error "wrong format for $2"
fi
}

function resolve_spark_master(){
# Set MASTER from spark-env if possible
DEFAULT_SPARK_MASTER_PORT=7077
if [ -z "$MASTER" ]; then
. $FWDIR/bin/load-spark-env.sh
if [ -n "$SPARK_MASTER_IP" ]; then
SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}"
export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
fi
fi

if [ -z "$MASTER" ]; then
export MASTER="$DEFAULT_MASTER"
fi
export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
fi
fi

}

function main(){
log_info "Base Directory set to $FWDIR"

resolve_spark_master
log_info "Spark Master is $MASTER"

log_info "Spark REPL options $SPARK_REPL_OPTS"
if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
# If you're using the Mintty terminal emulator in Cygwin, may need to set the
# "Backspace sends ^H" setting in "Keys" section of the Mintty options
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
stty icanon echo > /dev/null 2>&1
else
export SPARK_REPL_OPTS
$FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
fi
}

for option in "$@"
do
case $option in
-h | --help )
usage
exit 1
;;
-c | --cores)
shift
_1=$1
shift
set_cores $_1 "-c/--cores"
;;
-em | --executor-memory)
shift
_1=$1
shift
set_em $_1 "-em/--executor-memory"
;;
-dm | --driver-memory)
shift
_1=$1
shift
set_dm $_1 "-dm/--driver-memory"
;;
-m | --master)
shift
_1=$1
shift
set_spark_master $_1 "-m/--master"
;;
--log-conf)
shift
set_spark_log_conf "true"
info_log=1
;;
?)
;;
esac
done

# Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
# binary distribution of Spark where Scala is not installed
@@ -120,22 +242,10 @@ if [[ ! $? ]]; then
saved_stty=""
fi

if $cygwin; then
# Workaround for issue involving JLine and Cygwin
# (see http://sourceforge.net/p/jline/bugs/40/).
# If you're using the Mintty terminal emulator in Cygwin, may need to set the
# "Backspace sends ^H" setting in "Keys" section of the Mintty options
# (see https://github.com/sbt/sbt/issues/562).
stty -icanon min 1 -echo > /dev/null 2>&1
export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
$FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
stty icanon echo > /dev/null 2>&1
else
export SPARK_REPL_OPTS
$FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
fi
main

# record the exit status lest it be overwritten:
# then reenable echo and propagate the code.
exit_status=$?
onExit
