library(SparkR, lib.loc = paste(Sys.getenv("SPARK_HOME"), "/R/lib", sep = ""))
sc <- sparkR.session(master = "local")
df1 <- read.df("nycflights13.csv", source = "csv", header = "true", inferSchema = "true")
grp1 <- groupBy(filter(df1, "month in (1, 2, 3)"), "month")
sum1 <- withColumnRenamed(agg(grp1, min_dep = min(df1$dep_delay)), "month", "month1")
grp2 <- groupBy(filter(df1, "month in (2, 3, 4)"), "month")
sum2 <- withColumnRenamed(agg(grp2, max_dep = max(df1$dep_delay)), "month", "month2")
# INNER JOIN
showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all = FALSE))
showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "inner"))
#+------+-------+------+-------+
#|month1|min_dep|month2|max_dep|
#+------+-------+------+-------+
#| 3| -25| 3| 911|
#| 2| -33| 2| 853|
#+------+-------+------+-------+
# LEFT JOIN
showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all.x = TRUE))
showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "left"))
#+------+-------+------+-------+
#|month1|min_dep|month2|max_dep|
#+------+-------+------+-------+
#| 1| -30| null| null|
#| 3| -25| 3| 911|
#| 2| -33| 2| 853|
#+------+-------+------+-------+
# RIGHT JOIN
showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all.y = TRUE))
showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "right"))
#+------+-------+------+-------+
#|month1|min_dep|month2|max_dep|
#+------+-------+------+-------+
#| 3| -25| 3| 911|
#| null| null| 4| 960|
#| 2| -33| 2| 853|
#+------+-------+------+-------+
# FULL JOIN
showDF(merge(sum1, sum2, by.x = "month1", by.y = "month2", all = TRUE))
showDF(join(sum1, sum2, sum1$month1 == sum2$month2, "full"))
#+------+-------+------+-------+
#|month1|min_dep|month2|max_dep|
#+------+-------+------+-------+
#| 1| -30| null| null|
#| 3| -25| 3| 911|
#| null| null| 4| 960|
#| 2| -33| 2| 853|
#+------+-------+------+-------+
Like this:
Like Loading...