# NOTE(review): removed stray "Newer" / "Older" pagination text pasted in
# above the imports — it was not valid Python and would raise a NameError
# at import time.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, sum, round
from pyspark.sql.types import IntegerType
# Attach to the active Spark session if one exists, otherwise create one.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
# Exploratory query (intentionally kept commented out for reference):
# find in which years any cancellation code was registered at all.
# 'NA' marks a missing cancellation code in this dataset.
'''
df1 = df.filter(~col('CancellationCode').contains('NA')) \
.select('Year', lit(1).alias('Count')) \
.groupBy('Year').agg(sum('Count').alias('Count')) \
.show()
'''
# Cancellation codes exist for 2003-2008, but 2003 has very few records, so
# compare the distribution of cancellation reasons across 2004-2008.
# Keep only actual cancellations that carry a real code ('NA' = missing).
df = df.filter(
    (col('Year') >= 2004)
    & (col('Cancelled') == 1)
    & ~col('CancellationCode').contains('NA')
)
# Cancellations per (Year, CancellationCode): count rows directly rather than
# summing a lit(1) helper column (this also avoids relying on the imported
# `sum`, which shadows the builtin).
df1 = df.groupBy('Year', 'CancellationCode') \
    .agg(F.count(lit(1)).alias('Cancellations'))
# Total cancellations per year.
df2 = df.groupBy('Year') \
    .agg(F.count(lit(1)).alias('TotCancellations'))
# Percentage share of each cancellation code within its year. Joining with
# on='Year' (instead of df1.Year == df2.Year) produces a single Year column
# in the result and avoids ambiguous-column hazards on this self-join of
# two frames derived from the same lineage.
df = df1.join(df2, on='Year') \
    .select('Year', 'CancellationCode',
            (100 * col('Cancellations') / col('TotCancellations')).alias('Percentage')) \
    .sort('Year')