This repository was archived by the owner on Apr 23, 2024. It is now read-only.

Commit 24ff555

Author: Bruno Silva (committed)
Some classes of stock prices prediction
1 parent 2a594e6, commit 24ff555

File tree

9 files changed: +413 -48 lines

.gitignore (+3, -1)

@@ -9,4 +9,6 @@
 /project
 /derby.log
 /metastore_db
-/spark-warehouse
+/spark-warehouse
+/stocks.parquet
+/emas.parquet

hadoop/bin/winutils.exe (45.9 KB)

Binary file not shown.

src/main/scala/com/dev/bruno/ml/SparkApp.scala (-47)

This file was deleted.
src/main/scala/com/dev/bruno/ml/lr/ClosePriceTask.scala (new file, +67)

package com.dev.bruno.ml.lr

import java.io.File

object ClosePriceTask {

  import org.apache.spark.SparkConf
  import org.apache.spark.ml.feature.VectorAssembler
  import org.apache.spark.ml.regression.LinearRegression
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val source = new File("./stocks.parquet")

    if (!source.exists || !source.isDirectory) {
      System.out.println("Please execute the CsvMergeTask class first.")
      return
    }

    // Dependency to run in standalone mode on Windows
    val hadoopFolder = new File("./hadoop").getAbsolutePath
    System.setProperty("hadoop.home.dir", hadoopFolder)

    // Basic configuration
    val conf = new SparkConf()
      .setAppName("ClosePriceTask")
      .setMaster("local[*]")

    // Initialization of the Spark session (also serves as the SQL context)
    val sqlContext = SparkSession.builder.config(conf).getOrCreate

    // Columns to be used as input for the Linear Regression algorithm
    val features = Array("Open", "High", "Low", "NameIndex")

    // All features must be assembled into a single vector column
    // before they can be fed to the Linear Regression algorithm
    val assembler = new VectorAssembler()
      .setInputCols(features)
      .setOutputCol("features")

    val dataset = sqlContext.read.parquet("stocks.parquet")

    val featuredDataset = assembler.transform(dataset).sort("Date")

    // Split the dataset by date into training and test sets
    val trainingDataset = featuredDataset.filter("Date <= '2016-12-31'")
    val testDataset = featuredDataset.filter("Date > '2016-12-31'")

    // Linear Regression algorithm
    // TODO Try to understand why we need to use setLabelCol
    // (setLabelCol tells the estimator which column holds the target value to learn)
    val linearRegression = new LinearRegression()
      .setLabelCol("Close")
      .setFeaturesCol("features")
      .setPredictionCol("ClosePredicted")

    // The trained model used for prediction
    val model = linearRegression.fit(trainingDataset)

    // A new column, ClosePredicted, is added to testDataset
    val predictedDataset = model.transform(testDataset)

    // Select only the relevant columns to compare and show
    predictedDataset.select("Date", "Name", "Close", "ClosePredicted").show()

    sqlContext.close()
  }
}
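The task above only prints predicted and actual close prices side by side. A minimal sketch of how the fit could be quantified with Spark ML's RegressionEvaluator, assuming the column names used in ClosePriceTask and that predictedDataset is still in scope (this snippet is not part of the commit):

import org.apache.spark.ml.evaluation.RegressionEvaluator

// Hypothetical evaluation step: measures how far ClosePredicted deviates from Close.
val evaluator = new RegressionEvaluator()
  .setLabelCol("Close")
  .setPredictionCol("ClosePredicted")
  .setMetricName("rmse") // root mean squared error; "mae" and "r2" are also supported

val rmse = evaluator.evaluate(predictedDataset)
println(s"RMSE on the test set: $rmse")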
src/main/scala/com/dev/bruno/ml/lr/NextOpenPriceTask.scala (new file, +78)

package com.dev.bruno.ml.lr

import java.io.File

object NextOpenPriceTask {

  import org.apache.spark.SparkConf
  import org.apache.spark.ml.feature.VectorAssembler
  import org.apache.spark.ml.regression.LinearRegression
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val source = new File("./stocks.parquet")

    if (!source.exists || !source.isDirectory) {
      System.out.println("Please execute the CsvMergeTask class first.")
      return
    }

    // Dependency to run in standalone mode on Windows
    val hadoopFolder = new File("./hadoop").getAbsolutePath
    System.setProperty("hadoop.home.dir", hadoopFolder)

    // Basic configuration
    val conf = new SparkConf()
      .setAppName("NextOpenPriceTask")
      .setMaster("local[*]")

    // Initialization of the Spark session (also serves as the SQL context)
    val sqlContext = SparkSession.builder.config(conf).getOrCreate

    val dataset = sqlContext.read.parquet("stocks.parquet")

    // Compute the NextOpenPrice for the whole dataset: each row's Open is
    // shifted back one day, so a row at date D is joined with the Open of D + 1
    dataset.createOrReplaceTempView("temp_stocks")

    val nextOpenDatasetSql = "select date_add(Date, -1) as Date, " +
      "NameIndex, Open as NextOpenPrice from temp_stocks"

    val nextOpenDataset = sqlContext.sql(nextOpenDatasetSql)
    nextOpenDataset.createOrReplaceTempView("temp_next_openprice")

    val sql = "select s.*, o.NextOpenPrice from temp_stocks s, temp_next_openprice o" +
      " where to_date(s.Date) = o.Date and s.NameIndex = o.NameIndex"
    val updatedDataset = sqlContext.sql(sql)

    // Columns to be used as input for the Linear Regression algorithm
    val features = Array("Open", "Close", "High", "Low", "NameIndex")

    // All features must be assembled into a single vector column
    // before they can be fed to the Linear Regression algorithm
    val assembler = new VectorAssembler()
      .setInputCols(features)
      .setOutputCol("features")

    // Linear Regression algorithm
    // TODO Try to understand why we need to use setLabelCol
    // (setLabelCol tells the estimator which column holds the target value to learn)
    val linearRegression = new LinearRegression
    linearRegression.setLabelCol("NextOpenPrice")
    linearRegression.setFeaturesCol("features")
    linearRegression.setPredictionCol("NextOpenPricePredicted")

    val featuredDataset = assembler.transform(updatedDataset).sort("Date")

    // Split the dataset by date into training and test sets
    val trainingDataset = featuredDataset.filter("Date <= '2016-12-31'")
    val testDataset = featuredDataset.filter("Date > '2016-12-31'")

    // The trained model used for prediction
    val model = linearRegression.fit(trainingDataset)

    // A new column, NextOpenPricePredicted, is added to testDataset
    val predictedDataset = model.transform(testDataset)

    // Select only the relevant columns to compare and show
    predictedDataset.select("Date", "Name", "NextOpenPrice", "NextOpenPricePredicted").show()

    sqlContext.close()
  }
}
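The date_add(Date, -1) self-join above pairs each row with the Open of the calendar day that follows it. A minimal alternative sketch, not part of the commit, using a window function to pick the next trading row per stock; it assumes the same stocks.parquet columns (Date, NameIndex, Open) and a SparkSession bound to sqlContext:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lead

// Hypothetical alternative: lead() takes each stock's next-row Open
// instead of the strict next-calendar-day match used in NextOpenPriceTask.
val byStock = Window.partitionBy("NameIndex").orderBy("Date")

val withNextOpen = sqlContext.read.parquet("stocks.parquet")
  .withColumn("NextOpenPrice", lead("Open", 1).over(byStock))
  .na.drop(Seq("NextOpenPrice")) // the last row of each stock has no following day

Note that lead() returns the next available trading row rather than strictly the next calendar day, so rows before weekends and holidays are kept here but dropped by the original join.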
src/main/scala/com/dev/bruno/ml/model/Ema.scala (new file, +47)

package com.dev.bruno.ml.model

import java.sql.Timestamp

// Serializable holder for the exponential moving average values (ema6, ema10)
// of a stock, identified by NameIndex, on a given date.
@SerialVersionUID(100L)
class Ema extends Serializable {

  private var _nameIndex: Double = .0

  private var _ema6: Double = .0

  private var _ema10: Double = .0

  private var _date: Timestamp = _

  def this(nameIndex: Double, ema6: Double, ema10: Double, date: Timestamp) {
    this()
    this._nameIndex = nameIndex
    this._date = date
    this._ema6 = ema6
    this._ema10 = ema10
  }

  def nameIndex: Double = _nameIndex

  def nameIndex(nameIndex: Double): Unit = {
    this._nameIndex = nameIndex
  }

  def date: Timestamp = _date

  def date(date: Timestamp): Unit = {
    this._date = date
  }

  def ema6: Double = _ema6

  def ema6(ema6: Double): Unit = {
    this._ema6 = ema6
  }

  def ema10: Double = _ema10

  def ema10(ema10: Double): Unit = {
    this._ema10 = ema10
  }
}
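The commit only adds this value holder; nothing here computes the averages. For reference, the usual exponential moving average recurrence is EMA_t = alpha * price_t + (1 - alpha) * EMA_{t-1} with alpha = 2 / (N + 1). A minimal sketch, assuming ema6 and ema10 are intended to hold 6- and 10-period EMAs of a price series (this helper is hypothetical, not part of the commit):

// Hypothetical helper: folds a chronologically ordered price series
// into its N-period exponential moving average values.
def ema(prices: Seq[Double], n: Int): Seq[Double] = {
  require(prices.nonEmpty, "price series must not be empty")
  val alpha = 2.0 / (n + 1)
  prices.tail.scanLeft(prices.head) { (prev, price) =>
    alpha * price + (1 - alpha) * prev
  }
}

// e.g. ema(closes, 6) and ema(closes, 10) would populate Ema.ema6 / Ema.ema10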
src/main/scala/com/dev/bruno/ml/model/Stock.scala (new file, +71)

package com.dev.bruno.ml.model

import java.sql.Timestamp

// Serializable holder for a single stock quote: name, indexed name,
// date and the open/close/low/high/volume values.
@SerialVersionUID(100L)
class Stock extends Serializable {

  private var _name: String = _

  private var _nameIndex: Double = .0

  private var _date: Timestamp = _

  private var _open: Double = .0

  private var _close: Double = .0

  private var _low: Double = .0

  private var _high: Double = .0

  private var _volume: Double = .0

  def name: String = _name

  def name(name: String): Unit = {
    this._name = name
  }

  def nameIndex: Double = _nameIndex

  def nameIndex(nameIndex: Double): Unit = {
    this._nameIndex = nameIndex
  }

  def date: Timestamp = _date

  def date(date: Timestamp): Unit = {
    this._date = date
  }

  def open: Double = _open

  def open(open: Double): Unit = {
    this._open = open
  }

  def close: Double = _close

  def close(close: Double): Unit = {
    this._close = close
  }

  def low: Double = _low

  def low(low: Double): Unit = {
    this._low = low
  }

  def high: Double = _high

  def high(high: Double): Unit = {
    this._high = high
  }

  def volume: Double = _volume

  def volume(volume: Double): Unit = {
    this._volume = volume
  }
}
src/main/scala/com/dev/bruno/ml/util/CsvMergeTask.scala (new file, +51)

package com.dev.bruno.ml.util

import java.io.File

import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.StringIndexer
import org.apache.spark.sql._

object CsvMergeTask {

  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Please pass the location of the CSV files as an argument.")
      return
    }

    // Dependency to run in standalone mode on Windows
    val hadoopFolder = new File("./hadoop").getAbsolutePath
    System.setProperty("hadoop.home.dir", hadoopFolder)

    // Basic configuration
    val conf = new SparkConf()
      .setAppName("CsvMergeTask")
      .setMaster("local[*]")

    // Initialization of the Spark session (also serves as the SQL context)
    val sqlContext = SparkSession.builder.config(conf).getOrCreate

    val sparkContext = sqlContext.sparkContext

    val reader = sqlContext.read
      .format("com.databricks.spark.csv")
      .option("header", "true") // the CSV files have a header row; use it for column names
      .option("inferSchema", "true") // discover the column types

    // Load the CSV files directory, dropping incomplete and duplicate rows
    val filter: String = "Open is not null and High is not null and Low is not null " +
      " and Volume is not null and Date is not null and Name is not null"
    val dataset = reader.load(args(0)).filter(filter).distinct()

    // Create an index so that Name can be used as a feature in Linear Regression
    val indexer = new StringIndexer()
      .setInputCol("Name")
      .setOutputCol("NameIndex")

    val indexedDataset = indexer.fit(dataset).transform(dataset)

    // Save the merged dataset as Parquet
    indexedDataset.write.mode(SaveMode.Overwrite).parquet("stocks.parquet")
  }
}
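A quick way to sanity-check the merged output, not part of the commit, assuming CsvMergeTask has already written stocks.parquet to the working directory and the same sqlContext session is available:

// Hypothetical check: reload the Parquet output and inspect it.
val stocks = sqlContext.read.parquet("stocks.parquet")

stocks.printSchema() // expect Date, Open, High, Low, Close, Volume, Name, NameIndex
stocks.groupBy("Name", "NameIndex").count().show() // each Name should map to exactly one NameIndex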
