PySpark Scaling Demo¶

CIML Summer Institute¶

UC San Diego¶


Setup¶

In [1]:
# Initialize Spark

import pyspark
from pyspark.sql import SparkSession, Row

# Change N in local[N] to change the number of cores used
# Note the change in execution time
conf = pyspark.SparkConf().setAll([('spark.master', 'local[1]'),
                                   ('spark.app.name', 'Spark Demo'),
                                   ])
spark = SparkSession.builder.config(conf=conf).getOrCreate()

print(pyspark.version.__version__)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/22 17:29:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
3.5.5
In [2]:
print(spark.sparkContext.defaultParallelism)
1

Record execution start time¶

In [3]:
import time
start_time = time.time()

Read data¶

In [4]:
# Read data into Spark DataFrame 
# Data source: https://jmcauley.ucsd.edu/data/amazon/ 

from os.path import expanduser
HOME = expanduser("~")

data_path = HOME + '/data/'

dataFileName = data_path + "BookReviews_5M.txt"
textDF = spark.read.text(dataFileName)

Process data¶

Count number of rows¶

In [5]:
%%time

textDF.count()
[Stage 0:====================================================>      (8 + 1) / 9]
CPU times: user 4.56 ms, sys: 191 μs, total: 4.75 ms
Wall time: 3.05 s
                                                                                
Out[5]:
5000000

Show first few rows¶

In [6]:
textDF.show()
+--------------------+
|               value|
+--------------------+
|This was the firs...|
|Also after going ...|
|As with all of Ms...|
|I've not read any...|
|This romance nove...|
|Carolina Garcia A...|
|Not only can she ...|
|Once again Garcia...|
|The timing is jus...|
|Engaging. Dark. R...|
|Set amid the back...|
|This novel is a d...|
|If readers are ad...|
| Reviewed by Phyllis|
|      APOOO BookClub|
|A guilty pleasure...|
|In the tradition ...|
|Beryl Unger, top ...|
|What follows is a...|
|The book flap say...|
+--------------------+
only showing top 20 rows

Filter for reviews with the word 'science'¶

In [7]:
%%time

from pyspark.sql.functions import col

filteredDF = textDF.filter(col('value').contains('science'))
CPU times: user 1.29 ms, sys: 38 μs, total: 1.33 ms
Wall time: 30.8 ms
In [8]:
# Uncomment to show entire untruncated review 
# filteredDF.show(truncate=False)
filteredDF.show()
+--------------------+
|               value|
+--------------------+
|If so then it des...|
|This book was an ...|
|A "Welcome to the...|
|Despite painful g...|
|The storyline its...|
|Recommended if yo...|
|I don't really ge...|
|As a reader and r...|
|Sony E-readers:  ...|
|I had rather hope...|
|In the early 1980...|
|You just have to ...|
|Genres in fiction...|
|Although this see...|
|This is one of th...|
|I'm not normally ...|
|From the first pa...|
|Predators of Dark...|
|Predators Of Dark...|
|In short: Highly ...|
+--------------------+
only showing top 20 rows

In [9]:
%%time 

filteredDF.count()
[Stage 5:====================================================>      (8 + 1) / 9]
CPU times: user 2.37 ms, sys: 3.75 ms, total: 6.13 ms
Wall time: 3.31 s
                                                                                
Out[9]:
1403

Print time since execution start¶

In [10]:
print(time.time() - start_time)
8.840343475341797

Stop Spark Session¶

In [11]:
spark.stop()