Spark
Installing and connecting to a standalone Spark cluster in R
library(sparklyr)
library(tidyverse)
# Download and install Spark 3.5.3 built against Hadoop 3
spark_install(version = "3.5.3", hadoop_version = "3")
# Connect to a local, single-machine Spark instance
sc <- spark_connect(master = "local")
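A quick sanity check on the connection; a minimal sketch using sparklyr's spark_version() and spark_installed_versions() helpers:
# Confirm what we are connected to and which builds are installed locally
spark_version(sc)           # Spark version of the active connection
spark_installed_versions()  # locally installed Spark/Hadoop builds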
spark_disconnect(sc)  # close the connection when done

Setting up a Spark cluster and connecting to it
In a terminal (from the Spark root directory), start the master and a worker:
./sbin/start-master.sh              # the master logs its spark:// URL
./sbin/start-worker.sh <spark URL>  # register a worker with that master
# Back in R: connect to the standalone cluster
sc <- spark_connect(master = <spark URL>, spark_home = <spark location>)
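Filled in for illustration, assuming the master runs on localhost's default port 7077 and Spark is installed under /opt/spark (both values are hypothetical):
sc <- spark_connect(
  master     = "spark://localhost:7077",  # hypothetical URL from the master's log
  spark_home = "/opt/spark"               # hypothetical Spark installation directory
)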
sc  # print the connection object

Read data
# Load the CSVs into Spark, registering them as tables "code" and "bugs"
code <- spark_read_csv(sc, path = "/tmp/data/code_stats.csv", name = "code", escape = "\"")
bugs <- spark_read_csv(sc, path = "/tmp/data/bug_stats.csv", name = "bugs", escape = "\"")
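spark_read_csv() takes more options than shown above; a sketch spelling out a few of its documented defaults (header, delimiter, infer_schema, memory):
# Same read with common options made explicit
bugs <- spark_read_csv(
  sc, name = "bugs", path = "/tmp/data/bug_stats.csv",
  header       = TRUE,   # first line holds column names (default)
  delimiter    = ",",    # field separator (default)
  infer_schema = TRUE,   # let Spark guess column types
  memory       = TRUE,   # cache the table in Spark memory (default)
  escape       = "\""
)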
# Show available tables
src_tbls(sc)

Working with data
# "Print" the bugs spark data frame
bugs
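To inspect a Spark table without collecting it, dplyr's glimpse() and sparklyr's sdf_nrow() help; glimpse() on a remote table runs a small query:
glimpse(bugs)   # column names, types, and first values
sdf_nrow(bugs)  # row count computed in Spark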
# Aggregate: total deleted lines per (PID, BID) group
bugs.agg <- bugs |> group_by(PID, BID) |> summarize(Del = sum(DelLines))
bugs.agg |> compute()        # execute now and cache the result as a Spark table
df <- bugs.agg |> collect()  # execute and pull the result into an R data frame
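The pipeline above is lazy: dplyr translates it to Spark SQL, and nothing runs until compute() or collect(). show_query() (a dplyr generic) reveals the generated SQL; further verbs compose before execution. The Del > 100 threshold below is arbitrary, for illustration only:
bugs.agg |> show_query()  # print the Spark SQL behind the lazy pipeline

# Compose more lazy verbs, then execute once at collect()
bugs.agg |>
  filter(Del > 100) |>    # arbitrary threshold
  arrange(desc(Del)) |>
  head(10) |>             # translated to LIMIT 10 in SQL
  collect()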