Commit
Merge branch 'master' into fix-read
mhamilton723 authored Feb 26, 2021
2 parents 1912639 + 9cff1e6 commit 4d57899
Showing 121 changed files with 1,260 additions and 1,147 deletions.
README.md (7 changes: 5 additions & 2 deletions)

@@ -2,7 +2,7 @@

# Microsoft Machine Learning for Apache Spark

-[![Build Status](https://msazure.visualstudio.com/Cognitive%20Services/_apis/build/status/Azure.mmlspark?branchName=master)](https://msazure.visualstudio.com/Cognitive%20Services/_build/latest?definitionId=83120&branchName=master) [![codecov](https://codecov.io/gh/Azure/mmlspark/branch/master/graph/badge.svg)](https://codecov.io/gh/Azure/mmlspark) [![Gitter](https://badges.gitter.im/Microsoft/MMLSpark.svg)](https://gitter.im/Microsoft/MMLSpark?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)
+[![Build Status](https://msdata.visualstudio.com/A365/_apis/build/status/Azure.mmlspark?branchName=master)](https://msdata.visualstudio.com/A365/_build/latest?definitionId=15131&branchName=master) [![codecov](https://codecov.io/gh/Azure/mmlspark/branch/master/graph/badge.svg)](https://codecov.io/gh/Azure/mmlspark) [![Gitter](https://badges.gitter.im/Microsoft/MMLSpark.svg)](https://gitter.im/Microsoft/MMLSpark?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge)

[![Release Notes](https://img.shields.io/badge/release-notes-blue)](https://github.com/Azure/mmlspark/releases) [![Scala Docs](https://img.shields.io/static/v1?label=api%20docs&message=scala&color=blue&logo=scala)](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/scala/index.html#package) [![PySpark Docs](https://img.shields.io/static/v1?label=api%20docs&message=python&color=blue&logo=python)](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.html) [![Academic Paper](https://img.shields.io/badge/academic-paper-7fdcf7)](https://arxiv.org/abs/1810.08744)

@@ -60,7 +60,7 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm

| <img width="150" src="https://mmlspark.blob.core.windows.net/graphics/emails/isolation forest 3.svg"> |<img width="150" src="https://mmlspark.blob.core.windows.net/graphics/emails/cyberml.svg"> | <img width="150" src="https://mmlspark.blob.core.windows.net/graphics/emails/conditional_knn.svg"> |
|:--:|:--:|:--:|
-| **Isolation Forest on Spark** | **CyberML** | **Conditional KNN** |
+| **Isolation Forest on Spark** | [**CyberML**](https://github.com/Azure/mmlspark/blob/master/notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb) | **Conditional KNN** |
| Distributed Nonlinear Outlier Detection | Machine Learning Tools for Cyber Security | Scalable KNN Models with Conditional Queries |


@@ -84,6 +84,7 @@ PySpark](https://mmlspark.blob.core.windows.net/docs/1.0.0-rc3/pyspark/index.htm
- Train classification and regression models easily via implicit featurization
of data ([example 1])
- Train and evaluate a flight delay prediction system ([example 2])
+- Finding anomalous data access patterns using the Access Anomalies package of CyberML ([example 11])

See our [notebooks](notebooks/samples/) for all examples.
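As a minimal sketch of the implicit featurization mentioned in the first bullet above (illustrative, not part of this commit; it assumes a live Spark session and `train`/`test` DataFrames with a `label` column):

```python
from pyspark.ml.classification import LogisticRegression
from mmlspark.train import TrainClassifier

# TrainClassifier featurizes the raw input columns automatically before
# fitting, so no manual assembler/encoder pipeline is required.
model = TrainClassifier(model=LogisticRegression(), labelCol="label").fit(train)
scored = model.transform(test)
```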

@@ -107,6 +108,8 @@ See our [notebooks](notebooks/samples/) for all examples.

[example 10]: notebooks/gpu/DeepLearning%20-%20Distributed%20CNTK%20training.ipynb "CIFAR10 CNTK CNN Training"

+[example 11]: notebooks/samples/CyberML%20-%20Anomalous%20Access%20Detection.ipynb "Access Anomalies documentation, training and evaluation example"

## A short example

Below is an excerpt from a simple example of using a pre-trained CNN to
build.sbt (58 changes: 31 additions & 27 deletions)

@@ -11,12 +11,15 @@ import scala.sys.process.Process
val condaEnvName = "mmlspark"
name := "mmlspark"
organization := "com.microsoft.ml.spark"
-scalaVersion := "2.11.12"
+scalaVersion := "2.12.10"
+val sparkVersion = "3.0.1"

-val sparkVersion = "2.4.5"
+//val scalaMajorVersion = settingKey[String]("scalaMajorVersion")
+//scalaMajorVersion := {scalaVersion.value.split(".".toCharArray).dropRight(0).mkString(".")}
+val scalaMajorVersion = 2.12

val excludes = Seq(
-  ExclusionRule("org.apache.spark", "spark-tags_2.11"),
+  ExclusionRule("org.apache.spark", s"spark-tags_$scalaMajorVersion"),
ExclusionRule("org.scalatic"),
ExclusionRule("org.scalatest")
)
@@ -35,9 +38,9 @@ libraryDependencies ++= Seq(
"org.apache.httpcomponents" % "httpclient" % "4.5.6" excludeAll (excludes: _*),
"org.apache.httpcomponents" % "httpmime" % "4.5.6" excludeAll (excludes: _*),
"com.microsoft.ml.lightgbm" % "lightgbmlib" % "2.3.180" excludeAll (excludes: _*),
"com.github.vowpalwabbit" % "vw-jni" % "8.8.1" excludeAll (excludes: _*),
"com.linkedin.isolation-forest" %% "isolation-forest_2.4.3" % "0.3.2" excludeAll (excludes: _*),
"org.apache.spark" %% "spark-avro" % sparkVersion % "provided",
"com.github.vowpalwabbit" % "vw-jni" % "8.9.1" excludeAll (excludes: _*),
"com.linkedin.isolation-forest" %% "isolation-forest_3.0.0" % "1.0.1" excludeAll (excludes: _*),
"org.apache.spark" %% "spark-avro" % sparkVersion % "provided"
)

def txt(e: Elem, label: String): String = "\"" + e.child.filter(_.label == label).flatMap(_.text).mkString + "\""
@@ -111,9 +114,11 @@ def activateCondaEnv: Seq[String] = {
}
}



val packagePythonTask = TaskKey[Unit]("packagePython", "Package python sdk")
-val genDir = join("target", "scala-2.11", "generated")
-val unidocDir = join("target", "scala-2.11", "unidoc")
+val genDir = join("target", s"scala-${scalaMajorVersion}", "generated")
+val unidocDir = join("target", s"scala-${scalaMajorVersion}", "unidoc")
val pythonSrcDir = join(genDir.toString, "src", "python")
val unifiedDocDir = join(genDir.toString, "doc")
val pythonDocDir = join(unifiedDocDir.toString, "pyspark")
@@ -198,7 +203,7 @@ val publishR = TaskKey[Unit]("publishR", "publish R package to blob")
publishR := {
val s = streams.value
(runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value
-val rPackage = join("target", "scala-2.11", "generated", "package", "R")
+val rPackage = join("target", s"scala-${scalaMajorVersion}", "generated", "package", "R")
.listFiles().head
singleUploadToBlob(rPackage.toString, rPackage.getName, "rrr", s.log)
}
@@ -207,7 +212,7 @@ packagePythonTask := {
val s = streams.value
(runMain in Test).toTask(" com.microsoft.ml.spark.codegen.CodeGen").value
createCondaEnvTask.value
-val destPyDir = join("target", "scala-2.11", "classes", "mmlspark")
+val destPyDir = join("target", s"scala-${scalaMajorVersion}", "classes", "mmlspark")
if (destPyDir.exists()) FileUtils.forceDelete(destPyDir)
FileUtils.copyDirectory(join(pythonSrcDir.getAbsolutePath, "mmlspark"), destPyDir)

@@ -254,7 +259,7 @@ testPythonTask := {
"--cov-report=xml",
"mmlsparktest"
),
-new File("target/scala-2.11/generated/test/python/"),
+new File(s"target/scala-${scalaMajorVersion}/generated/test/python/")
) ! s.log
}

@@ -263,7 +268,7 @@ val datasetName = "datasets-2020-08-27.tgz"
val datasetUrl = new URL(s"https://mmlspark.blob.core.windows.net/installers/$datasetName")
val datasetDir = settingKey[File]("The directory that holds the dataset")
datasetDir := {
-join(target.value.toString, "scala-2.11", "datasets", datasetName.split(".".toCharArray.head).head)
+join(target.value.toString, s"scala-${scalaMajorVersion}", "datasets", datasetName.split(".".toCharArray.head).head)
}

getDatasetsTask := {
@@ -281,19 +286,19 @@ genBuildInfo := {

val buildInfo =
s"""
-|MMLSpark Build and Release Information
-|---------------
-|
-|### Maven Coordinates
-| `${organization.value}:${name.value}_2.11:${version.value}`
-|
-|### Maven Resolver
-| `https://mmlspark.azureedge.net/maven`
-|
-|### Documentation Pages:
-|[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
-|[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
-|
+|MMLSpark Build and Release Information
+|---------------
+|
+|### Maven Coordinates
+| `${organization.value}:${name.value}_${scalaMajorVersion}:${version.value}`
+|
+|### Maven Resolver
+| `https://mmlspark.azureedge.net/maven`
+|
+|### Documentation Pages:
+|[Scala Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/scala/index.html)
+|[Python Documentation](https://mmlspark.blob.core.windows.net/docs/${version.value}/pyspark/index.html)
+|
""".stripMargin

val infoFile = join("target", "Build.md")
@@ -375,7 +380,7 @@ val settings = Seq(
case x => MergeStrategy.first
},
assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false),
-buildInfoPackage := "com.microsoft.ml.spark.build") //++
+buildInfoPackage := "com.microsoft.ml.spark.build")

lazy val mmlspark = (project in file("."))
.enablePlugins(BuildInfoPlugin)
@@ -425,4 +430,3 @@ pgpPublicRing := {
dynverSonatypeSnapshots in ThisBuild := true
dynverSeparator in ThisBuild := "-"
publishTo := sonatypePublishToBundle.value
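The Maven coordinates and resolver emitted by `genBuildInfo` above can be consumed from PySpark roughly as follows (a sketch, not part of this commit; the version string is a placeholder for a release built against Scala 2.12):

```python
from pyspark.sql import SparkSession

# Pull the mmlspark assembly from the custom resolver at session startup.
# Substitute a real published version for the placeholder below.
spark = (SparkSession.builder
         .config("spark.jars.packages",
                 "com.microsoft.ml.spark:mmlspark_2.12:<version>")
         .config("spark.jars.repositories",
                 "https://mmlspark.azureedge.net/maven")
         .getOrCreate())
```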

docs/developer-readme.md (2 changes: 1 addition & 1 deletion)

@@ -1,7 +1,7 @@
# MMLSpark Development Setup

1) [Install SBT](https://www.scala-sbt.org/1.x/docs/Setup.html)
-  - Make sure to download JDK 1.8 if you don't have it
+  - Make sure to download JDK 11 if you don't have it
3) Fork the repository on github
- This is required if you would like to make PRs. If you choose the fork option, replace the clone link below with that of your fork.
2) Git Clone your fork, or the repo directly
environment.yaml (4 changes: 3 additions & 1 deletion)

@@ -1,7 +1,9 @@
name: mmlspark
channels:
+  - conda-forge
dependencies:
  - python=3.6
-  - pyspark=2.4.3
+  - pyspark=3.0.1
+  - requests
  - pip:
    - wheel
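A quick runtime check that an activated environment matches the pins above (illustrative only, not part of this commit):

```python
import sys
import pyspark

# environment.yaml pins python=3.6 and pyspark=3.0.1.
assert sys.version_info[:2] == (3, 6), sys.version
assert pyspark.__version__ == "3.0.1", pyspark.__version__
```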

Large diffs are not rendered by default.

@@ -174,10 +174,16 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizeUDF = udf(word_tokenize, ArrayType(StringType()))\n",
"def safe_tokenize(sent):\n",
" try:\n",
" return word_tokenize(sent)\n",
" except LookupError:\n",
" prepNLTK(None)\n",
" return word_tokenize(sent)\n",
"\n",
"tokenizeUDF = udf(safe_tokenize, ArrayType(StringType()))\n",
"df = df.withColumn(\"tokens\",tokenizeUDF(\"sentence\"))\n",
"\n",
"countUDF = udf(len, IntegerType())\n",
@@ -203,9 +209,15 @@
"featurizeUDF = udf(featurize, ArrayType(FloatType()))\n",
"\n",
"df = df.withColumn(\"features\", featurizeUDF(\"tokens\")).cache()\n",
"safe_show(df, 3) # Can be flaky on build server\n",
"safe_show(df, 5) # Can be flaky on build server\n",
" \n"
-]
+],
+"metadata": {
+  "collapsed": false,
+  "pycharm": {
+    "name": "#%%\n"
+  }
+}
},
{
"cell_type": "markdown",
@@ -338,15 +350,6 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
-},
-"pycharm": {
-  "stem_cell": {
-    "cell_type": "raw",
-    "source": [],
-    "metadata": {
-      "collapsed": false
-    }
-  }
}
},
"nbformat": 4,
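The `safe_tokenize` change above retries tokenization after provisioning the NLTK data on the executor. A standalone sketch of the same pattern, substituting `nltk.download("punkt")` for the notebook's `prepNLTK` helper (an assumption, since that helper's body is not shown here):

```python
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from nltk.tokenize import word_tokenize

def safe_tokenize(sent):
    # word_tokenize raises LookupError on an executor that has not yet
    # fetched the punkt tokenizer models; download once, then retry.
    try:
        return word_tokenize(sent)
    except LookupError:
        import nltk
        nltk.download("punkt", quiet=True)  # stand-in for the notebook's prepNLTK
        return word_tokenize(sent)

tokenizeUDF = udf(safe_tokenize, ArrayType(StringType()))
# `df` is assumed to be a DataFrame with a "sentence" string column.
df = df.withColumn("tokens", tokenizeUDF("sentence"))
```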
@@ -94,8 +94,8 @@
"outputs": [],
"source": [
"from mmlspark.lightgbm import LightGBMRegressionModel\n",
"model.saveNativeModel(\"mymodel\")\n",
"model = LightGBMRegressionModel.loadNativeModelFromFile(\"mymodel\")"
"model.saveNativeModel(\"/mymodel\")\n",
"model = LightGBMRegressionModel.loadNativeModelFromFile(\"/mymodel\")"
]
},
{
@@ -175,4 +175,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
-}
\ No newline at end of file
+}
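The path change above uses an absolute location, which resolves the same way regardless of each executor's working directory. The surrounding save/load round trip, condensed into a sketch (assuming `model` is a fitted LightGBM regressor and `test` a held-out DataFrame from the notebook):

```python
from mmlspark.lightgbm import LightGBMRegressionModel

# Persist the booster in LightGBM's native format, then reload and score.
model.saveNativeModel("/mymodel")
reloaded = LightGBMRegressionModel.loadNativeModelFromFile("/mymodel")
scored = reloaded.transform(test)
```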
@@ -153,7 +153,7 @@
"testCat = DataConversion(cols=[\"Carrier\",\"DepTimeBlk\",\"ArrTimeBlk\"],\n",
" convertTo=\"toCategorical\") \\\n",
" .transform(test)\n",
"lr = LinearRegression().setSolver(\"l-bfgs\").setRegParam(0.1) \\\n",
"lr = LinearRegression().setRegParam(0.1) \\\n",
" .setElasticNetParam(0.3)\n",
"model = TrainRegressor(model=lr, labelCol=\"ArrDelay\").fit(trainCat)"
]
@@ -232,15 +232,6 @@
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
-},
-"pycharm": {
-  "stem_cell": {
-    "cell_type": "raw",
-    "source": [],
-    "metadata": {
-      "collapsed": false
-    }
-  }
}
},
"nbformat": 4,
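For context, the training step above condensed into a standalone sketch (the `mmlspark.train` module path is assumed from the mmlspark Python package layout; `trainCat`/`testCat` come from the `DataConversion` step shown earlier in the notebook):

```python
from pyspark.ml.regression import LinearRegression
from mmlspark.train import TrainRegressor

# Elastic-net linear regression; TrainRegressor implicitly featurizes the
# categorical columns of trainCat and targets the ArrDelay label.
lr = LinearRegression().setRegParam(0.1).setElasticNetParam(0.3)
model = TrainRegressor(model=lr, labelCol="ArrDelay").fit(trainCat)
predictions = model.transform(testCat)
```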