Skip to content
Snippets Groups Projects
Commit 46f2740c authored by a.b.wahlgren@student.utwente.nl's avatar a.b.wahlgren@student.utwente.nl
Browse files

Beginning to gather proportion of delay averages

parent f846dfdf
No related branches found
No related tags found
No related merge requests found
...@@ -30,3 +30,29 @@ df_w = df.select(col("WeatherDelay")).where(col("WeatherDelay") > 0) ...@@ -30,3 +30,29 @@ df_w = df.select(col("WeatherDelay")).where(col("WeatherDelay") > 0)
sum_w = df_w.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1] sum_w = df_w.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
avg_w = sum_w / df_w.count() avg_w = sum_w / df_w.count()
print("The average delay caused by weather conditions is "+str(avg_w)+" minutes.") print("The average delay caused by weather conditions is "+str(avg_w)+" minutes.")
'''
--- Average Arrival Delay ---
'''
df_arr = df.select(col("ArrDelay")).where(col("ArrDelay") > 0)
sum_arr = df_arr.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
avg_arr = sum_arr / df_arr.count()
print("The average arrival delay is "+str(avg_arr)+" minutes.")
'''
--- Average Departure Delay ---
'''
df_dep = df.select(col("DepDelay")).where(col("DepDelay") > 0)
sum_dep = df_dep.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
avg_dep = sum_dep / df_dep.count()
print("The average departure delay is "+str(avg_dep)+" minutes.")
'''
--- Average Carrier Delay ---
'''
df_car = df.select(col("CarrierDelay")).where(col("CarrierDelay") > 0)
sum_car = df_arr.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
avg_car = sum_car / df_car.count()
print("The average carrier delay caused is "+str(avg_car)+" minutes.")
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as pmax
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("/user/s2875462/airline.csv.shuffle", header="true")
'''
QUESTION: Is there a certain threshold where the total delay causes flights to be canceled?
Local run:
Cluster run:
Approach:
1. Get all flights which are cancelled and have been delayed
2. Sort them on department delay
3. Look for patterns
Output:
'''
df2 = df.select(col("DepDelay"), col("Cancelled"))\
.where(col("DepDelay") >= 0)\
.sort(col("DepDelay"))
tot = df2.count()
df3 = df2.limit(tot//4).select(pmax("DepDelay").alias("max"))
df3.show()
df4 = df2.limit(2*tot//4).select(pmax("DepDelay").alias("max"))
df4.show()
df5 = df2.limit(3*tot//4).select(pmax("DepDelay").alias("max"))
df5.show()
df6 = df2.select(pmax("DepDelay").alias("max"))
df6.show()
# .where((col("Cancelled") == 1) & (col("DepDelay") > 0))
df2.show(10)
print(df2.count())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment