Spark
Installing Spark and connecting to a local instance from R
library(sparklyr)
library(tidyverse)
# Download and install Spark 3.5.3 built for Hadoop 3
spark_install(version="3.5.3", hadoop_version="3")
# Connect to a local, single-machine Spark instance
sc <- spark_connect(master="local")
# Close the connection when done
spark_disconnect(sc)
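To check which Spark builds sparklyr can download and which are already installed locally (both helpers ship with sparklyr):
# Versions available for download and versions already on this machine
spark_available_versions()
spark_installed_versions()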
Setting up a Spark cluster and connecting to it
In a terminal (from the Spark root directory), start the master and then a worker:
./sbin/start-master.sh
./sbin/start-worker.sh <spark URL>
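The master prints its URL on startup (it is also shown on the master's web UI, by default on port 8080); for a standalone master it has the form spark://hostname:7077. For example, assuming the master runs on the same machine with the default port:
./sbin/start-worker.sh spark://localhost:7077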
sc <- spark_connect(master=<spark URL>, spark_home=<spark location>)
# Print the connection object to summarize the cluster
sc
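Once connected, the connection can be sanity-checked from R (spark_version() and spark_web() are sparklyr helpers):
# Spark version the connection is running against
spark_version(sc)
# Open the cluster's web UI in a browser
spark_web(sc)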
Read data
<- spark_read_csv(sc, path="/tmp/data/code_stats.csv", name="code", escape="\"")
code <- spark_read_csv(sc, path="/tmp/data/bug_stats.csv", name="bugs", escape="\"")
bugs
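By default spark_read_csv() reads a header row and infers column types by scanning the data; for large files it can be faster to declare the schema up front. A sketch, assuming bug_stats.csv contains exactly the three columns used later in these notes (the types are assumptions):
bugs <- spark_read_csv(sc, path="/tmp/data/bug_stats.csv", name="bugs",
                       escape="\"", infer_schema=FALSE,
                       columns=c(PID="integer", BID="integer", DelLines="integer"))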
# Show available tables
src_tbls(sc)
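A dplyr reference to any table registered with the connection can be (re)created by name with dplyr's tbl():
# Re-create a reference to the "bugs" table registered above
bugs <- tbl(sc, "bugs")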
Working with data
# "Print" the bugs spark data frame
bugs
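Printing a Spark data frame fetches only the first few rows from the cluster; the full size has to be computed in Spark (sdf_nrow() is a sparklyr helper):
# Count all rows of the bugs table on the cluster
sdf_nrow(bugs)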
# Some aggregation
bugs.agg <- bugs |> group_by(PID, BID) |> summarize(Del=sum(DelLines))
# Run the aggregation in Spark, storing the result as a Spark table
bugs.agg |> compute()
# Run the aggregation in Spark and pull the result into R
df <- bugs.agg |> collect()
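compute() and collect() differ in where the result ends up: compute() executes the query and caches the result as a table inside Spark, while collect() transfers it into the R session as a local tibble. A quick check (class() is base R; the classes shown in the comments are what sparklyr typically reports):
class(bugs.agg)  # "tbl_spark" "tbl_sql" "tbl_lazy" "tbl" -- still remote
class(df)        # "tbl_df" "tbl" "data.frame" -- local tibble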