Merge pull request #1 from rxin/openstack
Bring the branch up to date and fix some documentation typos.
gilv committed Jun 17, 2014
2 parents cca7192 + ac0679e commit 6994827
Showing 464 changed files with 16,905 additions and 5,833 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -7,7 +7,7 @@
sbt/*.jar
.settings
.cache
.mima-excludes
.generated-mima*
/build/
work/
out/
1 change: 1 addition & 0 deletions .rat-excludes
@@ -3,6 +3,7 @@ target
.project
.classpath
.mima-excludes
.generated-mima-excludes
.rat-excludes
.*md
derby.log
7 changes: 4 additions & 3 deletions README.md
@@ -9,13 +9,14 @@ You can find the latest Spark documentation, including a programming
guide, on the project webpage at <http://spark.apache.org/documentation.html>.
This README file only contains basic setup instructions.


## Building Spark

Spark is built on Scala 2.10. To build Spark and its example programs, run:

./sbt/sbt assembly

(You do not need to do this if you downloaded a pre-built package.)

## Interactive Scala Shell

The easiest way to start using Spark is through the Scala shell:
@@ -41,9 +42,9 @@ And run the following command, which should also return 1000:
Spark also comes with several sample programs in the `examples` directory.
To run one of them, use `./bin/run-example <class> [params]`. For example:

./bin/run-example org.apache.spark.examples.SparkLR
./bin/run-example SparkPi

will run the Logistic Regression example locally.
will run the Pi example locally.
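
As a quick illustration only (the trailing argument is a hypothetical value, not something taken from this commit), the updated form also accepts example-specific arguments:

    ./bin/run-example SparkPi 10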

You can set the MASTER environment variable when running examples to submit
examples to a cluster. This can be a mesos:// or spark:// URL,
4 changes: 2 additions & 2 deletions assembly/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.1.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -96,7 +96,7 @@
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>org.datanucleus:*</exclude>
<exclude>org/datanucleus/**</exclude>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
2 changes: 1 addition & 1 deletion bagel/pom.xml
@@ -21,7 +21,7 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>spark-parent</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>1.1.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

6 changes: 2 additions & 4 deletions bagel/src/test/scala/org/apache/spark/bagel/BagelSuite.scala
@@ -38,8 +38,6 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeo
sc.stop()
sc = null
}
// To avoid Akka rebinding to the same port, since it doesn't unbind immediately on shutdown
System.clearProperty("spark.driver.port")
}

test("halting by voting") {
@@ -82,7 +80,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeo
test("large number of iterations") {
// This tests whether jobs with a large number of iterations finish in a reasonable time,
// because non-memoized recursion in RDD or DAGScheduler used to cause them to hang
failAfter(10 seconds) {
failAfter(30 seconds) {
sc = new SparkContext("local", "test")
val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
val msgs = sc.parallelize(Array[(String, TestMessage)]())
@@ -103,7 +101,7 @@ class BagelSuite extends FunSuite with Assertions with BeforeAndAfter with Timeo
sc = new SparkContext("local", "test")
val verts = sc.parallelize((1 to 4).map(id => (id.toString, new TestVertex(true, 0))))
val msgs = sc.parallelize(Array[(String, TestMessage)]())
val numSupersteps = 50
val numSupersteps = 20
val result =
Bagel.run(sc, verts, msgs, sc.defaultParallelism, StorageLevel.DISK_ONLY) {
(self: TestVertex, msgs: Option[Array[TestMessage]], superstep: Int) =>
24 changes: 23 additions & 1 deletion bin/compute-classpath.cmd
@@ -20,6 +20,13 @@ rem
rem This script computes Spark's classpath and prints it to stdout; it's used by both the "run"
rem script and the ExecutorRunner in standalone cluster mode.

rem If we're called from spark-class2.cmd, it already set enabledelayedexpansion and setting
rem it here would stop us from affecting its copy of the CLASSPATH variable; otherwise we
rem need to set it here because we use !datanucleus_jars! below.
if "%DONT_PRINT_CLASSPATH%"=="1" goto skip_delayed_expansion
setlocal enabledelayedexpansion
:skip_delayed_expansion

set SCALA_VERSION=2.10

rem Figure out where the Spark framework is installed
@@ -31,7 +38,7 @@ if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"
rem Build up classpath
set CLASSPATH=%FWDIR%conf
if exist "%FWDIR%RELEASE" (
for %%d in ("%FWDIR%jars\spark-assembly*.jar") do (
for %%d in ("%FWDIR%lib\spark-assembly*.jar") do (
set ASSEMBLY_JAR=%%d
)
) else (
@@ -42,6 +49,21 @@ if exist "%FWDIR%RELEASE" (

set CLASSPATH=%CLASSPATH%;%ASSEMBLY_JAR%

rem When Hive support is needed, Datanucleus jars must be included on the classpath.
rem Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
rem Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
rem built with Hive, so look for them there.
if exist "%FWDIR%RELEASE" (
set datanucleus_dir=%FWDIR%lib
) else (
set datanucleus_dir=%FWDIR%lib_managed\jars
)
set "datanucleus_jars="
for %%d in ("%datanucleus_dir%\datanucleus-*.jar") do (
set datanucleus_jars=!datanucleus_jars!;%%d
)
set CLASSPATH=%CLASSPATH%;%datanucleus_jars%

set SPARK_CLASSES=%FWDIR%core\target\scala-%SCALA_VERSION%\classes
set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%repl\target\scala-%SCALA_VERSION%\classes
set SPARK_CLASSES=%SPARK_CLASSES%;%FWDIR%mllib\target\scala-%SCALA_VERSION%\classes
34 changes: 25 additions & 9 deletions bin/compute-classpath.sh
@@ -38,8 +38,10 @@ else
JAR_CMD="jar"
fi

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
# A developer option to prepend more recently compiled Spark classes
if [ -n "$SPARK_PREPEND_CLASSES" ]; then
echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
"classes ahead of assembly." >&2
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
@@ -51,17 +53,31 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
fi

ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
# Use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
assembly_folder="$FWDIR"/lib
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
else
ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
fi
assembly_folder="$ASSEMBLY_DIR"
fi

num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find Spark assembly in $assembly_folder"
echo "You need to build Spark before running this program."
exit 1
fi
if [ "$num_jars" -gt "1" ]; then
jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
echo "Found multiple Spark assembly jars in $assembly_folder:"
echo "$jars_list"
echo "Please remove all but one jar."
exit 1
fi

ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)

# Verify that versions of java used to build the jars and run Spark are compatible
jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
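
The SPARK_PREPEND_CLASSES hook added above is a developer convenience for iterating on Spark itself. A minimal usage sketch (the spark-shell entry point is just an assumed example; per the check above the variable only needs to be non-empty):

    # Recompile a module, then launch with the freshly built classes ahead of the assembly jar
    SPARK_PREPEND_CLASSES=1 ./bin/spark-shell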
16 changes: 13 additions & 3 deletions bin/pyspark
@@ -17,7 +17,7 @@
# limitations under the License.
#

# Figure out where the Scala framework is installed
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"

# Export this as SPARK_HOME
@@ -45,7 +45,7 @@ fi
. $FWDIR/bin/load-spark-env.sh

# Figure out which Python executable to use
if [ -z "$PYSPARK_PYTHON" ] ; then
if [[ -z "$PYSPARK_PYTHON" ]]; then
PYSPARK_PYTHON="python"
fi
export PYSPARK_PYTHON
@@ -59,7 +59,7 @@ export OLD_PYTHONSTARTUP=$PYTHONSTARTUP
export PYTHONSTARTUP=$FWDIR/python/pyspark/shell.py

# If IPython options are specified, assume user wants to run IPython
if [ -n "$IPYTHON_OPTS" ]; then
if [[ -n "$IPYTHON_OPTS" ]]; then
IPYTHON=1
fi

@@ -76,6 +76,16 @@ for i in "$@"; do
done
export PYSPARK_SUBMIT_ARGS

# For pyspark tests
if [[ -n "$SPARK_TESTING" ]]; then
if [[ -n "$PYSPARK_DOC_TEST" ]]; then
exec "$PYSPARK_PYTHON" -m doctest $1
else
exec "$PYSPARK_PYTHON" $1
fi
exit
fi

# If a python file is provided, directly run spark-submit.
if [[ "$1" =~ \.py$ ]]; then
echo -e "\nWARNING: Running python applications through ./bin/pyspark is deprecated as of Spark 1.0." 1>&2
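
For the SPARK_TESTING branch added above, a hedged sketch of how it is meant to be driven (the module path below is an illustrative assumption; the script simply execs $PYSPARK_PYTHON, optionally under doctest, on whatever file is passed as $1):

    # Run one module's doctests through the patched launcher
    SPARK_TESTING=1 PYSPARK_DOC_TEST=1 ./bin/pyspark python/pyspark/rdd.py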
25 changes: 12 additions & 13 deletions bin/run-example
@@ -23,6 +23,16 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
export SPARK_HOME="$FWDIR"
EXAMPLES_DIR="$FWDIR"/examples

if [ -n "$1" ]; then
EXAMPLE_CLASS="$1"
shift
else
echo "Usage: ./bin/run-example <example-class> [example-args]"
echo " - set MASTER=XX to use a specific master"
echo " - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)"
exit 1
fi

if [ -f "$FWDIR/RELEASE" ]; then
export SPARK_EXAMPLES_JAR=`ls "$FWDIR"/lib/spark-examples-*hadoop*.jar`
elif [ -e "$EXAMPLES_DIR"/target/scala-$SCALA_VERSION/spark-examples-*hadoop*.jar ]; then
@@ -37,23 +47,12 @@ fi

EXAMPLE_MASTER=${MASTER:-"local[*]"}

if [ -n "$1" ]; then
EXAMPLE_CLASS="$1"
shift
else
echo "usage: ./bin/run-example <example-class> [example-args]"
echo " - set MASTER=XX to use a specific master"
echo " - can use abbreviated example class name (e.g. SparkPi, mllib.MovieLensALS)"
echo
exit -1
fi

if [[ ! $EXAMPLE_CLASS == org.apache.spark.examples* ]]; then
EXAMPLE_CLASS="org.apache.spark.examples.$EXAMPLE_CLASS"
fi

./bin/spark-submit \
"$FWDIR"/bin/spark-submit \
--master $EXAMPLE_MASTER \
--class $EXAMPLE_CLASS \
$SPARK_EXAMPLES_JAR \
"$SPARK_EXAMPLES_JAR" \
"$@"
51 changes: 39 additions & 12 deletions bin/run-example2.cmd
@@ -30,32 +30,59 @@ if exist "%FWDIR%conf\spark-env.cmd" call "%FWDIR%conf\spark-env.cmd"

rem Test that an argument was given
if not "x%1"=="x" goto arg_given
echo Usage: run-example ^<example-class^> [^<args^>]
echo Usage: run-example ^<example-class^> [example-args]
echo - set MASTER=XX to use a specific master
echo - can use abbreviated example class name (e.g. SparkPi, mllib.LinearRegression)
goto exit
:arg_given

set EXAMPLES_DIR=%FWDIR%examples

rem Figure out the JAR file that our examples were packaged into.
set SPARK_EXAMPLES_JAR=
for %%d in ("%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\spark-examples*assembly*.jar") do (
set SPARK_EXAMPLES_JAR=%%d
if exist "%FWDIR%RELEASE" (
for %%d in ("%FWDIR%lib\spark-examples*.jar") do (
set SPARK_EXAMPLES_JAR=%%d
)
) else (
for %%d in ("%EXAMPLES_DIR%\target\scala-%SCALA_VERSION%\spark-examples*.jar") do (
set SPARK_EXAMPLES_JAR=%%d
)
)
if "x%SPARK_EXAMPLES_JAR%"=="x" (
echo Failed to find Spark examples assembly JAR.
echo You need to build Spark with sbt\sbt assembly before running this program.
goto exit
)

rem Compute Spark classpath using external script
set DONT_PRINT_CLASSPATH=1
call "%FWDIR%bin\compute-classpath.cmd"
set DONT_PRINT_CLASSPATH=0
set CLASSPATH=%SPARK_EXAMPLES_JAR%;%CLASSPATH%
rem Set master from MASTER environment variable if given
if "x%MASTER%"=="x" (
set EXAMPLE_MASTER=local[*]
) else (
set EXAMPLE_MASTER=%MASTER%
)

rem If the EXAMPLE_CLASS does not start with org.apache.spark.examples, add that
set EXAMPLE_CLASS=%1
set PREFIX=%EXAMPLE_CLASS:~0,25%
if not %PREFIX%==org.apache.spark.examples (
set EXAMPLE_CLASS=org.apache.spark.examples.%EXAMPLE_CLASS%
)

rem Get the tail of the argument list, to skip the first one. This is surprisingly
rem complicated on Windows.
set "ARGS="
:top
shift
if "%~1" neq "" (
set ARGS=%ARGS% "%~1"
goto :top
)
if defined ARGS set ARGS=%ARGS:~1%

rem Figure out where java is.
set RUNNER=java
if not "x%JAVA_HOME%"=="x" set RUNNER=%JAVA_HOME%\bin\java
call "%FWDIR%bin\spark-submit.cmd" ^
--master %EXAMPLE_MASTER% ^
--class %EXAMPLE_CLASS% ^
"%SPARK_EXAMPLES_JAR%" %ARGS%

"%RUNNER%" -cp "%CLASSPATH%" %JAVA_OPTS% %*
:exit
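
Both run-example launchers above now delegate to spark-submit rather than building a classpath and invoking java directly. A sketch of a typical invocation of the shell version (host and port are placeholders, not taken from the commit): MASTER picks the cluster, and a bare class name such as SparkPi is expanded to org.apache.spark.examples.SparkPi before the submit.

    MASTER=spark://host:7077 ./bin/run-example SparkPi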
23 changes: 3 additions & 20 deletions bin/spark-class
@@ -24,7 +24,7 @@ esac

SCALA_VERSION=2.10

# Figure out where the Scala framework is installed
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"

# Export this as SPARK_HOME
@@ -99,31 +99,14 @@ else
fi

# Set JAVA_OPTS to be able to load native libraries and to set heap size
JAVA_OPTS="$OUR_JAVA_OPTS"
JAVA_OPTS="-XX:MaxPermSize=128m $OUR_JAVA_OPTS"
JAVA_OPTS="$JAVA_OPTS -Xms$OUR_JAVA_MEM -Xmx$OUR_JAVA_MEM"
# Load extra JAVA_OPTS from conf/java-opts, if it exists
if [ -e "$FWDIR/conf/java-opts" ] ; then
JAVA_OPTS="$JAVA_OPTS `cat $FWDIR/conf/java-opts`"
fi
export JAVA_OPTS
# Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in ExecutorRunner.scala!

if [ ! -f "$FWDIR/RELEASE" ]; then
# Exit if the user hasn't compiled Spark
num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
echo "You need to build Spark before running this program." >&2
exit 1
fi
if [ "$num_jars" -gt "1" ]; then
echo "Found multiple Spark assembly jars in $FWDIR/assembly/target/scala-$SCALA_VERSION:" >&2
echo "$jars_list"
echo "Please remove all but one jar."
exit 1
fi
fi
# Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!

TOOLS_DIR="$FWDIR"/tools
SPARK_TOOLS_JAR=""
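
As the hunk above shows, spark-class now always sets -XX:MaxPermSize=128m and still appends anything found in conf/java-opts. A minimal sketch of supplying an extra JVM flag that way (the specific flag is only an illustrative assumption):

    # One line of additional JVM options, picked up by spark-class at launch
    echo "-verbose:gc" > conf/java-opts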