Repository: spark
Updated Branches:
  refs/heads/master a6e0afdcf -> 0154587ab
document laziness of parallelize

Took me several hours to figure out this behavior. It would be good to highlight it in the documentation.

Author: Ariel Rabkin <[email protected]>

Closes #1070 from asrabkin/master and squashes the following commits:

29a076e [Ariel Rabkin] doc fix

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0154587a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0154587a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0154587a

Branch: refs/heads/master
Commit: 0154587ab71d1b864f97497dbb38bc52b87675be
Parents: a6e0afd
Author: Ariel Rabkin <[email protected]>
Authored: Thu Jun 12 17:51:33 2014 -0700
Committer: Reynold Xin <[email protected]>
Committed: Thu Jun 12 17:51:33 2014 -0700

----------------------------------------------------------------------
 .../src/main/scala/org/apache/spark/SparkContext.scala | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/0154587a/core/src/main/scala/org/apache/spark/SparkContext.scala
----------------------------------------------------------------------
diff --git a/core/src/main/scala/org/apache/spark/SparkContext.scala b/core/src/main/scala/org/apache/spark/SparkContext.scala
index 8fbda2c..35970c2 100644
--- a/core/src/main/scala/org/apache/spark/SparkContext.scala
+++ b/core/src/main/scala/org/apache/spark/SparkContext.scala
@@ -434,12 +434,21 @@ class SparkContext(config: SparkConf) extends Logging {
 
   // Methods for creating RDDs
 
-  /** Distribute a local Scala collection to form an RDD. */
+  /** Distribute a local Scala collection to form an RDD.
+   *
+   * @note Parallelize acts lazily. If `seq` is a mutable collection and is
+   * altered after the call to parallelize and before the first action on the
+   * RDD, the resultant RDD will reflect the modified collection. Pass a copy of
+   * the argument to avoid this.
+   */
   def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
     new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
   }
 
-  /** Distribute a local Scala collection to form an RDD. */
+  /** Distribute a local Scala collection to form an RDD.
+   *
+   * This method is identical to `parallelize`.
+   */
   def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
     parallelize(seq, numSlices)
   }
