Skip to content
Snippets Groups Projects
Commit cb9c6797 authored by eriwe600's avatar eriwe600
Browse files

Cancellations ratio over years

parent 53179bed
No related branches found
No related tags found
No related merge requests found
File added
File added
File added
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, sum, round
from pyspark.sql.types import IntegerType
spark = SparkSession.builder.getOrCreate()
df = spark.read.csv("/user/s2875462/airline.csv.shuffle", header='true')
# check which years cancellation was registered
'''
df1 = df.filter(~col('CancellationCode').contains('NA')) \
.select('Year', lit(1).alias('Count')) \
.groupBy('Year').agg(sum('Count').alias('Count')) \
.show()
'''
# Cancellation was registered 2003-2008, but very few 2003. Check the ratio difference between 2004 and 2008:
df = df.filter((col('Year') >= 2004) & (col('Cancelled') == 1) & ~col('CancellationCode').contains('NA'))
df1 = df.select('Year', 'CancellationCode', lit(1).alias('Cancellations')) \
.groupBy('Year', 'CancellationCode').agg(sum('Cancellations').alias('Cancellations'))
df2 = df.select('Year', lit(1).alias('TotCancellations')) \
.groupBy('Year').agg(sum('TotCancellations').alias('TotCancellations'))
df1.show()
df2.show()
''' Does not work for some reason, only run ones
def totalCancellations(year):
total = df2.filter(col('Year').contains(year))
total.show()
print(total.collect()[0].TotCancellations)
return total.collect()[0].TotCancellations
df = df1.select('Year', 'CancellationCode', ((100*col('Count')/totalCancellations(col('Year')))).alias('Percentage'))
df = df1.withColumn('Percentage', 100*df1.Count/totalCancellations(df1.Year))
'''
df = df1.join(df2, df1.Year == df2.Year) \
.select(df1.Year, df1.CancellationCode, (100*col('Cancellations')/col('TotCancellations')).alias('Percentage'))
df.show()
#df = df.select(col('Year'), col('CancellationCode'), (100*col('Cancellations')/col('TotCancellations')).alias('Percentage'))
df_years = df.sort(col('Year'))
df_years.show()
df_code = df.sort(col('CancellationCode'))
df_code.show()
......@@ -10,5 +10,9 @@ df = df.select('Distance', 'ArrDelay', 'DepDelay') \
.filter(~(col('ArrDelay').contains('NA') | col('DepDelay').contains('NA'))) \
.withColumn('ArrDelay', F.when(F.col('ArrDelay') >= 0, F.col('ArrDelay')).otherwise(0)) \
.withColumn('DepDelay', F.when(F.col('DepDelay') >= 0, F.col('DepDelay')).otherwise(0)) \
.orderBy('Distance') \
.show()
.orderBy('Distance')
df1 = df.select('Distance', 'ArrDelay') \
.filter(col('ArrDelay') != 0).show()
#df2 = df.select('Distance', 'DepDelay').show()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment