[SPARK-9812] [STREAMING] Fix Python 3 compatibility issue in PySpark Streaming and some docs

This PR includes the following fixes:
1. Use `range` instead of `xrange` in `queue_stream.py` to support Python 3.
2. Fix the issue that `utf8_decoder` returns `bytes` rather than `str` when it receives an empty `bytes` object in Python 3 (see the sketch after this list).
3. Fix the commands in the docs so that users can copy them directly to the command line. The previous commands broke in the middle of a path, so a copied path was split into two parts by the extra spaces, forcing the user to fix it by hand.
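
To make item 2 concrete, here is a minimal, self-contained Python 3 sketch. The helper names are illustrative (not part of the PR), but the fixed body mirrors the patched `utf8_decoder`: because an empty `bytes` object is falsy, the old `s and s.decode('utf-8')` idiom short-circuits and returns `b''` unchanged instead of decoding it to `str`.

# Sketch of the bug fixed in this PR (helper names are illustrative).
# In Python 3, b"" is falsy, so `s and s.decode('utf-8')` short-circuits
# and returns the bytes object itself instead of decoding it to str.

def old_utf8_decoder(s):
    return s and s.decode('utf-8')

def fixed_utf8_decoder(s):
    if s is None:
        return None
    return s.decode('utf-8')

assert old_utf8_decoder(b"") == b""        # bug: empty input stays bytes
assert fixed_utf8_decoder(b"") == ""       # fix: empty input becomes str
assert fixed_utf8_decoder(None) is None    # None still passes through
assert fixed_utf8_decoder(b"spark") == "spark"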

Author: zsxwing <[email protected]>

Closes apache#8315 from zsxwing/SPARK-9812.
zsxwing authored and tdas committed Aug 20, 2015
1 parent 2f2686a commit 1f29d50
Showing 8 changed files with 23 additions and 14 deletions.
6 changes: 3 additions & 3 deletions examples/src/main/python/streaming/direct_kafka_wordcount.py
@@ -23,8 +23,8 @@
 http://kafka.apache.org/documentation.html#quickstart
 and then run the example
-`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\
-spark-streaming-kafka-assembly-*.jar \
+`$ bin/spark-submit --jars \
+external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \
 examples/src/main/python/streaming/direct_kafka_wordcount.py \
 localhost:9092 test`
 """
@@ -37,7 +37,7 @@

 if __name__ == "__main__":
     if len(sys.argv) != 3:
-        print >> sys.stderr, "Usage: direct_kafka_wordcount.py <broker_list> <topic>"
+        print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
         exit(-1)

     sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
5 changes: 3 additions & 2 deletions examples/src/main/python/streaming/flume_wordcount.py
@@ -23,8 +23,9 @@
 https://flume.apache.org/documentation.html
 and then run the example
-`$ bin/spark-submit --jars external/flume-assembly/target/scala-*/\
-spark-streaming-flume-assembly-*.jar examples/src/main/python/streaming/flume_wordcount.py \
+`$ bin/spark-submit --jars \
+external/flume-assembly/target/scala-*/spark-streaming-flume-assembly-*.jar \
+examples/src/main/python/streaming/flume_wordcount.py \
 localhost 12345
 """
 from __future__ import print_function
5 changes: 3 additions & 2 deletions examples/src/main/python/streaming/kafka_wordcount.py
@@ -23,8 +23,9 @@
 http://kafka.apache.org/documentation.html#quickstart
 and then run the example
-`$ bin/spark-submit --jars external/kafka-assembly/target/scala-*/\
-spark-streaming-kafka-assembly-*.jar examples/src/main/python/streaming/kafka_wordcount.py \
+`$ bin/spark-submit --jars \
+external/kafka-assembly/target/scala-*/spark-streaming-kafka-assembly-*.jar \
+examples/src/main/python/streaming/kafka_wordcount.py \
 localhost:2181 test`
 """
 from __future__ import print_function
5 changes: 3 additions & 2 deletions examples/src/main/python/streaming/mqtt_wordcount.py
@@ -26,8 +26,9 @@
 http://www.eclipse.org/paho/#getting-started
 and then run the example
-`$ bin/spark-submit --jars external/mqtt-assembly/target/scala-*/\
-spark-streaming-mqtt-assembly-*.jar examples/src/main/python/streaming/mqtt_wordcount.py \
+`$ bin/spark-submit --jars \
+external/mqtt-assembly/target/scala-*/spark-streaming-mqtt-assembly-*.jar \
+examples/src/main/python/streaming/mqtt_wordcount.py \
 tcp://localhost:1883 foo`
 """

4 changes: 2 additions & 2 deletions examples/src/main/python/streaming/queue_stream.py
@@ -36,8 +36,8 @@
     # Create the queue through which RDDs can be pushed to
     # a QueueInputDStream
     rddQueue = []
-    for i in xrange(5):
-        rddQueue += [ssc.sparkContext.parallelize([j for j in xrange(1, 1001)], 10)]
+    for i in range(5):
+        rddQueue += [ssc.sparkContext.parallelize([j for j in range(1, 1001)], 10)]

     # Create the QueueInputDStream and use it to do some processing
     inputStream = ssc.queueStream(rddQueue)
4 changes: 3 additions & 1 deletion python/pyspark/streaming/flume.py
@@ -31,7 +31,9 @@

 def utf8_decoder(s):
     """ Decode the unicode as UTF-8 """
-    return s and s.decode('utf-8')
+    if s is None:
+        return None
+    return s.decode('utf-8')


class FlumeUtils(object):
4 changes: 3 additions & 1 deletion python/pyspark/streaming/kafka.py
@@ -29,7 +29,9 @@

 def utf8_decoder(s):
     """ Decode the unicode as UTF-8 """
-    return s and s.decode('utf-8')
+    if s is None:
+        return None
+    return s.decode('utf-8')


class KafkaUtils(object):
4 changes: 3 additions & 1 deletion python/pyspark/streaming/kinesis.py
@@ -26,7 +26,9 @@

 def utf8_decoder(s):
     """ Decode the unicode as UTF-8 """
-    return s and s.decode('utf-8')
+    if s is None:
+        return None
+    return s.decode('utf-8')


class KinesisUtils(object):
