Since Spark 2.0 the main entry point is the spark session, which replaces the separate sc (SparkContext) and sqlContext entry points.
# Launch an interactive PySpark shell
pyspark

# Or use IPython as the driver shell
module load anaconda2
PYSPARK_DRIVER_PYTHON=ipython pyspark
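Inside the shell, the spark session exposes the older entry points, so existing code needs little adaptation. A minimal sketch (the SELECT 1 query is just an illustration):

# spark is a SparkSession and wraps the old entry points
sc = spark.sparkContext                 # what used to be the sc variable
spark.sql('SELECT 1 AS test').show()    # what sqlContext.sql(...) used to do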
from pyspark.sql import Row

# Define a Row type with two fields
Person = Row('name', 'surname')

data = []
data.append(Person('Joe', 'MacMillan'))
data.append(Person('Gordon', 'Clark'))
data.append(Person('Cameron', 'Howe'))

# Create a DataFrame directly from the list of Rows
df = spark.createDataFrame(data)
df.show()
+-------+---------+
| name| surname|
+-------+---------+
| Joe|MacMillan|
| Gordon| Clark|
|Cameron| Howe|
+-------+---------+
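The session also takes over the SQL functionality that sqlContext used to provide; a brief sketch (the view name people is arbitrary):

df.createOrReplaceTempView('people')
spark.sql("SELECT name FROM people WHERE surname = 'Clark'").show()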
# client mode (Python application)
spark-submit --master yarn \
    --name testWC test.py input output

# cluster mode
spark-submit --master yarn --deploy-mode cluster \
    --name testWC test.py input output
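The commands above assume a test.py script that takes input and output paths as arguments. A minimal word-count sketch consistent with the testWC name (hypothetical, for illustration only):

import sys
from pyspark.sql import SparkSession

if __name__ == '__main__':
    # Input and output paths are passed on the spark-submit command line
    input_path, output_path = sys.argv[1], sys.argv[2]

    spark = SparkSession.builder.appName('testWC').getOrCreate()

    # Classic word count over the input text file
    counts = (spark.sparkContext.textFile(input_path)
              .flatMap(lambda line: line.split())
              .map(lambda word: (word, 1))
              .reduceByKey(lambda a, b: a + b))
    counts.saveAsTextFile(output_path)

    spark.stop()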
# client mode (Java/Scala application packaged as a jar)
spark-submit --master yarn --name testWC \
    --class es.cesga.hadoop.Test test.jar \
    input output

# cluster mode
spark-submit --master yarn --deploy-mode cluster \
    --name testWC \
    --class es.cesga.hadoop.Test test.jar \
    input output
The main options for tuning the resources of a job:

--num-executors NUM     Number of executors to launch (Default: 2)
--executor-cores NUM    Number of cores per executor (Default: 1)
--driver-cores NUM      Number of cores for the driver (cluster mode only)
--executor-memory MEM   Memory per executor (Default: 1G)
--queue QUEUE_NAME      The YARN queue to submit to (Default: "default")
--proxy-user NAME       User to impersonate when submitting the application
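Putting the options together, a sketch of a submission that overrides the defaults (the resource sizes and queue are illustrative, not recommendations):

spark-submit --master yarn --deploy-mode cluster \
    --name testWC \
    --num-executors 4 \
    --executor-cores 2 \
    --executor-memory 2G \
    --queue default \
    test.py input output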