# (removed: "Newer"/"Older" pagination labels and a line-number gutter
#  accidentally pasted from a web file viewer; they were not valid Python)
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, sum, round
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()

# Airline on-time dataset; header row supplies column names, every column
# is read as a string (no schema given), hence the string comparisons below.
df = spark.read.csv("/user/s2875462/airline.csv.shuffle", header='true')

# Earlier exploration (a per-year count of rows with a real CancellationCode)
# showed codes were only registered 2003-2008, with very few rows in 2003.
# So restrict the analysis to cancelled flights from 2004 onward that carry
# an actual cancellation code.
df = df.filter(
    (col('Year') >= 2004)
    & (col('Cancelled') == 1)
    & ~col('CancellationCode').contains('NA')
)

# Cancellations per (year, cancellation code).
# F.count(lit(1)) is the idiomatic row count and avoids relying on the
# builtin-shadowing `sum` import.
df1 = df.groupBy('Year', 'CancellationCode') \
    .agg(F.count(lit(1)).alias('Cancellations'))

# Total cancellations per year.
df2 = df.groupBy('Year') \
    .agg(F.count(lit(1)).alias('TotCancellations'))

df1.show()
df2.show()

# NOTE(review): a previous attempt passed col('Year') (a Column object) to a
# driver-side Python helper that did .collect() to fetch the yearly total per
# row.  That cannot work: per-row values never reach the driver during query
# construction.  A join on Year computes the same ratio entirely inside Spark.
# Joining on the column *name* keeps a single unambiguous 'Year' column,
# unlike the df1.Year == df2.Year condition which duplicates it.
df = df1.join(df2, on='Year') \
    .select('Year', 'CancellationCode',
            (100 * col('Cancellations') / col('TotCancellations')).alias('Percentage'))
df.show()

# Same result, ordered by year and then ordered by cancellation code.
df_years = df.sort(col('Year'))
df_years.show()
df_code = df.sort(col('CancellationCode'))
df_code.show()