Beginning to gather proportion of delay averages

46f2740c · a.b.wahlgren@student.utwente.nl · f846dfdf · 46f2740c · 46f2740c
Commit 46f2740c authored 3 years ago by a.b.wahlgren@student.utwente.nl
--- a/avg_delays.py
+++ b/avg_delays.py
@@ -30,3 +30,29 @@ df_w = df.select(col("WeatherDelay")).where(col("WeatherDelay") > 0)
 sum_w = df_w.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
 avg_w = sum_w / df_w.count()
 print("The average delay caused by weather conditions is "+str(avg_w)+" minutes.")
+'''
+   --- Average Arrival Delay ---
+'''
+df_arr = df.select(col("ArrDelay")).where(col("ArrDelay") > 0)
+sum_arr = df_arr.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
+avg_arr = sum_arr / df_arr.count()
+print("The average arrival delay is "+str(avg_arr)+" minutes.")
+'''
+   --- Average Departure Delay ---
+'''
+   df_dep = df.select(col("DepDelay")).where(col("DepDelay") > 0)
+   sum_dep = df_dep.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
+   avg_dep = sum_dep / df_dep.count()
+   print("The average departure delay is "+str(avg_dep)+" minutes.")
+'''
+   --- Average Carrier Delay ---
+'''
+df_car = df.select(col("CarrierDelay")).where(col("CarrierDelay") > 0)
+sum_car = df_arr.rdd.map(lambda x: (1, x[0])).reduceByKey(lambda x, y: int(x) + int(y)).collect()[0][1]
+avg_car = sum_car / df_car.count()
+print("The average carrier delay caused is "+str(avg_car)+" minutes.")
--- a/canc_threshold.py
+++ b/canc_threshold.py
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import col, max as pmax
+spark = SparkSession.builder.getOrCreate()
+df = spark.read.csv("/user/s2875462/airline.csv.shuffle", header="true")
+'''
+QUESTION: Is there a certain threshold where the total delay causes flights to be canceled?
+Local run:
+Cluster run:
+Approach:
+    1. Get all flights which are cancelled and have been delayed
+    2. Sort them on department delay
+    3. Look for patterns
+Output:
+'''
+df2 = df.select(col("DepDelay"), col("Cancelled"))\
+         .where(col("DepDelay") >= 0)\
+         .sort(col("DepDelay"))
+tot = df2.count()
+df3 = df2.limit(tot//4).select(pmax("DepDelay").alias("max"))
+df3.show()
+df4 = df2.limit(2*tot//4).select(pmax("DepDelay").alias("max"))
+df4.show()
+df5 = df2.limit(3*tot//4).select(pmax("DepDelay").alias("max"))
+df5.show()
+df6 = df2.select(pmax("DepDelay").alias("max"))
+df6.show()
+#        .where((col("Cancelled") == 1) & (col("DepDelay") > 0))
+df2.show(10)
+print(df2.count())