Skip to content

Commit

Permalink
fix: Fix read schemas (#988)
Browse files Browse the repository at this point in the history
* fix: Fix read schemas

* cache buster

* squash

* add python package publishing

* squash
  • Loading branch information
mhamilton723 authored Feb 26, 2021
1 parent 9cff1e6 commit 0717ac4
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 11 deletions.
13 changes: 11 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,17 @@ installPipPackageTask := {
pythonPackageDir) ! s.log
}

// sbt task: build the Python wheel for this version and upload it to the
// "pip" blob container so it can be pip-installed alongside the Scala artifacts.
val publishPython = TaskKey[Unit]("publishPython", "publish python wheel")
publishPython := {
  val s = streams.value
  // NOTE(review): in sbt, `.value` declares a task dependency — publishLocal and
  // packagePythonTask both complete before this body runs, but their relative
  // order to each other is not guaranteed; confirm neither depends on the other.
  publishLocal.value
  packagePythonTask.value
  // Upload <pythonizedVersion>-py2.py3-none-any.whl from the local package dir
  // to blob path <version>/<wheel-name> in the "pip" container.
  singleUploadToBlob(
    join(pythonPackageDir.toString, s"${pythonizedVersion.value}-py2.py3-none-any.whl").toString,
    version.value + s"/${pythonizedVersion.value}-py2.py3-none-any.whl",
    "pip", s.log)
}

val testPythonTask = TaskKey[Unit]("testPython", "test python sdk")

testPythonTask := {
Expand Down Expand Up @@ -419,5 +430,3 @@ pgpPublicRing := {
dynverSonatypeSnapshots in ThisBuild := true
dynverSeparator in ThisBuild := "-"
publishTo := sonatypePublishToBundle.value

// Cache Break 1
2 changes: 1 addition & 1 deletion pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
- bash: |
source activate mmlspark
sbt packagePython
sbt publishBlob publishSigned publishDocs publishR
sbt publishBlob publishSigned publishDocs publishR publishPython
sbt genBuildInfo
echo "##vso[task.uploadsummary]$(pwd)/target/Build.md"
sbt release
Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ case class ReadAnalyzeResult(version: String,
readResults: Seq[ReadResult])

// Schema for one page of output from the Cognitive Services Read API.
// `language` is Option because the service may omit the field; the geometric
// fields (angle/width/height and all bounding boxes) are Double because the
// service returns fractional values — the earlier Int/String schema broke
// response deserialization (this commit's "Fix read schemas").
// NOTE(review): the scraped diff contained both the old and new field lines
// interleaved; this is the reconstructed post-change definition.
case class ReadResult(page: Int,
                      language: Option[String],
                      angle: Double,
                      width: Double,
                      height: Double,
                      unit: String,
                      lines: Array[ReadLine])

// One recognized line of text; boundingBox holds fractional coordinates.
case class ReadLine(boundingBox: Array[Double], text: String, words: Array[ReadWord])

// One recognized word with its bounding box and recognition confidence in [0, 1]
// (range presumed from the service docs — confirm).
case class ReadWord(boundingBox: Array[Double], text: String, confidence: Double)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ case class TARequest(documents: Seq[TADocument])

object TARequest extends SparkBindings[TARequest]

// Per-document error returned by the Text Analytics service. The field was
// renamed from `message` to `error` in this commit to match the service's
// actual JSON response key, so the error text deserializes correctly.
// NOTE(review): the scraped diff showed both the old and new definitions;
// this is the post-change one.
case class TAError(id: String, error: String)

object TAError extends SparkBindings[TAError]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase}
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.NamespaceInjections.pipelineModel
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.typedLit
import org.apache.spark.sql.functions.{corr, typedLit}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalactic.Equality
import org.scalatest.Assertion
import com.microsoft.ml.spark.FluentAPI._
import com.microsoft.ml.spark.featurize.text.PageSplitter

// Mixin supplying the Cognitive Services API key for tests: the
// COGNITIVE_API_KEY environment variable wins, otherwise fall back to the
// shared secret store.
trait CognitiveKey {
  lazy val cognitiveKey = sys.env.get("COGNITIVE_API_KEY").getOrElse(Secrets.CognitiveApiKey)
}

trait OCRUtils extends TestBase {

import session.implicits._

lazy val df: DataFrame = Seq(
Expand All @@ -28,6 +30,10 @@ trait OCRUtils extends TestBase {
"https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png"
).toDF("url")

// Single-row DataFrame with one PDF URL, used to exercise the Read API's
// multi-page document support.
lazy val pdfDf: DataFrame = Seq(
  "https://mmlspark.blob.core.windows.net/datasets/OCR/paper.pdf"
).toDF("url")

lazy val bytesDF: DataFrame = BingImageSearch
.downloadFromUrls("url", "imageBytes", 4, 10000)
.transform(df)
Expand Down Expand Up @@ -84,6 +90,7 @@ class OCRSuite extends TransformerFuzzing[OCR] with CognitiveKey with Flaky with

override def reader: MLReadable[_] = OCR
}

class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] with CognitiveKey with Flaky {

import session.implicits._
Expand Down Expand Up @@ -154,7 +161,7 @@ class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] with CognitiveK
val fromRow = AIResponse.makeFromRowConverter
val responses = ai.transform(df).select("features")
.collect().toList.map(r =>
fromRow(r.getStruct(0)))
fromRow(r.getStruct(0)))
assert(responses.head.categories.get.head.name === "others_")
assert(responses(1).categories.get.head.name === "text_sign")
}
Expand Down Expand Up @@ -257,6 +264,7 @@ class ReadSuite extends TransformerFuzzing[Read]
def prep(df: DataFrame) = {
df.select("url", "ocr.analyzeResult.readResults")
}

super.assertDFEq(prep(df1), prep(df2))(eq)
}

Expand All @@ -269,6 +277,17 @@ class ReadSuite extends TransformerFuzzing[Read]
headStr === "CLOSED WHEN ONE DOOR CLOSES, ANOTHER OPENS. ALL YOU HAVE TO DO IS WALK IN")
}

// Runs the Read transformer over a PDF URL, flattens the OCR result into a
// single "ocr" string column, and checks the first row's extracted text.
test("Basic Usage with pdf") {
  val results = pdfDf.mlTransform(read, Read.flatten("ocr", "ocr"))
    .select("ocr")
    .collect()
  val headStr = results.head.getString(0)
  // Expected opening text of the test paper's first page; note the non-ASCII
  // multiplication signs (×) must survive response decoding.
  val correctPrefix = "Full Tree Conditioned Tree Component Space " +
    "Efficiency Measured Data O(n × d) 380 MB Tree O((2n/l) × d)"

  assert(headStr.startsWith(correctPrefix))
}

test("Basic Usage with Bytes") {
val results = bytesDF.mlTransform(bytesRead, Read.flatten("ocr", "ocr"))
.select("ocr")
Expand Down

0 comments on commit 0717ac4

Please sign in to comment.