Skip to content
Snippets Groups Projects
Commit 07fb92e0 authored by s2300397's avatar s2300397
Browse files

added the run time in distributed mode

parent 5043a506
No related branches found
No related tags found
No related merge requests found
#project code
#group 3
#Run time in distributed mode with 10 executors is 2m39s
from pyspark import SparkContext
import pyspark.sql.functions as F
from pyspark.sql.functions import col, round
......@@ -77,7 +80,7 @@ df_arrival=df.select('FlightNum', 'ArrDelay', 'DepDelay')
#arrival delay time analysis
from pyspark.mllib.stat import Statistics
# Cast both delay columns to integers before correlating them: DataFrame.stat.corr
# only works on numeric columns (presumably the columns were loaded as strings --
# TODO confirm against the code that reads the data).
df_arrival=df_arrival.withColumn('ArrDelay', df_arrival['ArrDelay'].cast(IntegerType()))
df_arrival=df_arrival.withColumn('DepDelay', df_arrival['DepDelay'].cast(IntegerType()))
# Correlation coefficient between arrival delay and departure delay.
correlation=df_arrival.stat.corr("ArrDelay", "DepDelay")
print(str(correlation)) #the author's run reported 0.86, i.e. a strong positive correlation between departure delay and arrival delay
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment