From 0447c9fc563ef4bc02f937bfea63ea1d62f252cf Mon Sep 17 00:00:00 2001
From: Reynold Xin
Date: Fri, 5 Sep 2014 23:59:34 -0700
Subject: [PATCH] Removed sample code.

---
 core/pom.xml                  |   2 +-
 docs/openstack-integration.md | 131 +++-------------------------------
 2 files changed, 10 insertions(+), 123 deletions(-)

diff --git a/core/pom.xml b/core/pom.xml
index 746862892f074..55bfe0b841ea4 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -44,7 +44,7 @@
-
+
     <dependency>
       <groupId>net.java.dev.jets3t</groupId>
       <artifactId>jets3t</artifactId>
diff --git a/docs/openstack-integration.md b/docs/openstack-integration.md
index ac5b5a34a141c..ff3cf95ac2f0b 100644
--- a/docs/openstack-integration.md
+++ b/docs/openstack-integration.md
@@ -1,6 +1,6 @@
 ---
 layout: global
-title: OpenStack Integration
+title: OpenStack Swift Integration
 ---
 
 * This will become a table of contents (this text will be scraped).
@@ -9,16 +9,12 @@ title: OpenStack Integration
 
 # Accessing OpenStack Swift from Spark
 
-Spark's file interface allows it to process data in OpenStack Swift using the same URI
-formats that are supported for Hadoop. You can specify a path in Swift as input through a
-URI of the form swift://. You will also need to set your
+Spark's support for Hadoop InputFormat allows it to process data in OpenStack Swift using the
+same URI formats as in Hadoop. You can specify a path in Swift as input through a
+URI of the form swift://container.PROVIDER/path. You will also need to set your
 Swift security credentials, through core-sites.xml or via
-SparkContext.hadoopConfiguration.
-Openstack Swift driver was merged in Hadoop version 2.3.0
-([Swift driver](https://issues.apache.org/jira/browse/HADOOP-8545)).
-Users that wish to use previous Hadoop versions will need to configure Swift driver manually.
-Current Swift driver requires Swift to use Keystone authentication method. There are recent efforts
-to support temp auth [Hadoop-10420](https://issues.apache.org/jira/browse/HADOOP-10420).
+SparkContext.hadoopConfiguration.
+Current Swift driver requires Swift to use Keystone authentication method.
 
 # Configuring Swift
 Proxy server of Swift should include list_endpoints middleware. More information
 available
@@ -27,9 +23,9 @@
 
 # Dependencies
 
-Spark should be compiled with hadoop-openstack-2.3.0.jar that is distributted with
-Hadoop 2.3.0. For the Maven builds, the dependencyManagement section of Spark's main
-pom.xml should include:
+The Spark application should include hadoop-openstack dependency.
+For example, for Maven support, add the following to the pom.xml file:
+
 {% highlight xml %}
 <dependency>
   ...
   <groupId>org.apache.hadoop</groupId>
   <artifactId>hadoop-openstack</artifactId>
   <version>2.3.0</version>
   ...
 </dependency>
 {% endhighlight %}
@@ -42,19 +38,6 @@ Hadoop 2.3.0. For the Maven builds, the dependencyManagement sectio
 
-In addition, both core and yarn projects should add
-hadoop-openstack to the dependencies section of their
-pom.xml:
-{% highlight xml %}
-<dependencies>
-  ...
-  <dependency>
-    <groupId>org.apache.hadoop</groupId>
-    <artifactId>hadoop-openstack</artifactId>
-  </dependency>
-  ...
-</dependencies>
-{% endhighlight %}
 
 # Configuration Parameters
 
@@ -171,99 +154,3 @@ Notice that
 We suggest to keep those parameters in core-sites.xml for testing purposes
 when running Spark via spark-shell. For job submissions they should be provided via
 sparkContext.hadoopConfiguration.
-
-# Usage examples
-
-Assume Keystone's authentication URL is http://127.0.0.1:5000/v2.0/tokens and Keystone contains tenant test, user tester with password testing. In our example we define PROVIDER=SparkTest. Assume that Swift contains container logs with an object data.log. To access data.log from Spark the swift:// scheme should be used.
-
-
-## Running Spark via spark-shell
-
-Make sure that core-sites.xml contains fs.swift.service.SparkTest.tenant, fs.swift.service.SparkTest.username,
-fs.swift.service.SparkTest.password. Run Spark via spark-shell and access Swift via swift:// scheme.
-
-{% highlight scala %}
-val sfdata = sc.textFile("swift://logs.SparkTest/data.log")
-sfdata.count()
-{% endhighlight %}
-
-
-## Sample Application
-
-In this case core-sites.xml need not contain fs.swift.service.SparkTest.tenant, fs.swift.service.SparkTest.username,
-fs.swift.service.SparkTest.password. Example of Java usage:
-
-{% highlight java %}
-/* SimpleApp.java */
-import org.apache.spark.api.java.*;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.Function;
-
-public class SimpleApp {
-  public static void main(String[] args) {
-    String logFile = "swift://logs.SparkTest/data.log";
-    SparkConf conf = new SparkConf().setAppName("Simple Application");
-    JavaSparkContext sc = new JavaSparkContext(conf);
-    sc.hadoopConfiguration().set("fs.swift.service.ibm.tenant", "test");
-    sc.hadoopConfiguration().set("fs.swift.service.ibm.password", "testing");
-    sc.hadoopConfiguration().set("fs.swift.service.ibm.username", "tester");
-
-    JavaRDD<String> logData = sc.textFile(logFile).cache();
-    long num = logData.count();
-
-    System.out.println("Total number of lines: " + num);
-  }
-}
-{% endhighlight %}
-
-The directory structure is
-{% highlight bash %}
-./src
-./src/main
-./src/main/java
-./src/main/java/SimpleApp.java
-{% endhighlight %}
-
-Maven pom.xml should contain:
-{% highlight xml %}
-<project>
-  <groupId>edu.berkeley</groupId>
-  <artifactId>simple-project</artifactId>
-  <modelVersion>4.0.0</modelVersion>
-  <name>Simple Project</name>
-  <packaging>jar</packaging>
-  <version>1.0</version>
-  <repositories>
-    <repository>
-      <id>Akka repository</id>
-      <url>http://repo.akka.io/releases</url>
-    </repository>
-  </repositories>
-  <build>
-    <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-compiler-plugin</artifactId>
-        <version>2.3</version>
-        <configuration>
-          <source>1.6</source>
-          <target>1.6</target>
-        </configuration>
-      </plugin>
-    </plugins>
-  </build>
-  <dependencies>
-    <dependency>
-      <groupId>org.apache.spark</groupId>
-      <artifactId>spark-core_2.10</artifactId>
-      <version>1.0.0</version>
-    </dependency>
-  </dependencies>
-</project>
-{% endhighlight %}
-
-Compile and execute
-{% highlight bash %}
-mvn package
-SPARK_HOME/spark-submit --class SimpleApp --master local[4] target/simple-project-1.0.jar
-{% endhighlight %}
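
Note: this patch deletes the only end-to-end credential example, while the surviving text still tells users to set credentials through core-sites.xml or SparkContext.hadoopConfiguration. For reference, below is a minimal Scala sketch of that surviving pattern. It reuses the illustrative values from the deleted examples (PROVIDER SparkTest, tenant test, user tester, password testing, container logs, object data.log, Keystone URL http://127.0.0.1:5000/v2.0/tokens); none of these names are required, and the fs.swift.service.<PROVIDER>.* keys are the Hadoop Swift driver's convention, not Spark-specific.

{% highlight scala %}
import org.apache.spark.{SparkConf, SparkContext}

object SwiftSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SwiftSketch"))

    // Keystone credentials for the illustrative provider "SparkTest";
    // the keys follow the Hadoop Swift driver's fs.swift.service.<PROVIDER>.* scheme.
    val hc = sc.hadoopConfiguration
    hc.set("fs.swift.service.SparkTest.auth.url", "http://127.0.0.1:5000/v2.0/tokens")
    hc.set("fs.swift.service.SparkTest.tenant", "test")
    hc.set("fs.swift.service.SparkTest.username", "tester")
    hc.set("fs.swift.service.SparkTest.password", "testing")

    // Read object data.log from container "logs" through the swift:// scheme.
    val logData = sc.textFile("swift://logs.SparkTest/data.log")
    println("Total number of lines: " + logData.count())

    sc.stop()
  }
}
{% endhighlight %}

Packaged into an application jar, this would be launched with spark-submit much like the deleted SimpleApp walkthrough; in spark-shell the same hadoopConfiguration calls can be issued on the provided sc before reading the path.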