Skip to content

Commit

Permalink
fix: Fix read schemas (#988)
Browse files Browse the repository at this point in the history
* fix: Fix read schemas

* cache buster

* squash

* add python package publishing

* squash
  • Loading branch information
mhamilton723 authored Feb 26, 2021
1 parent 9cff1e6 commit 0717ac4
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 11 deletions.
13 changes: 11 additions & 2 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,17 @@ installPipPackageTask := {
pythonPackageDir) ! s.log
}

// sbt task: build the Python wheel for this version and upload it to the
// "pip" blob container so it can be pip-installed alongside the Scala artifacts.
val publishPython = TaskKey[Unit]("publishPython", "publish python wheel")
publishPython := {
  val s = streams.value
  // NOTE(review): in sbt, `.value` declares a task dependency — publishLocal and
  // packagePythonTask both complete before this body runs, but their relative
  // order to each other is not guaranteed; confirm neither depends on the other.
  publishLocal.value
  packagePythonTask.value
  // Upload <pythonizedVersion>-py2.py3-none-any.whl from the local package dir
  // to blob path <version>/<wheel-name> in the "pip" container.
  singleUploadToBlob(
    join(pythonPackageDir.toString, s"${pythonizedVersion.value}-py2.py3-none-any.whl").toString,
    version.value + s"/${pythonizedVersion.value}-py2.py3-none-any.whl",
    "pip", s.log)
}

val testPythonTask = TaskKey[Unit]("testPython", "test python sdk")

testPythonTask := {
Expand Down Expand Up @@ -419,5 +430,3 @@ pgpPublicRing := {
dynverSonatypeSnapshots in ThisBuild := true
dynverSeparator in ThisBuild := "-"
publishTo := sonatypePublishToBundle.value

// Cache Break 1
2 changes: 1 addition & 1 deletion pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
- bash: |
source activate mmlspark
sbt packagePython
sbt publishBlob publishSigned publishDocs publishR
sbt publishBlob publishSigned publishDocs publishR publishPython
sbt genBuildInfo
echo "##vso[task.uploadsummary]$(pwd)/target/Build.md"
sbt release
Expand Down
10 changes: 5 additions & 5 deletions src/main/scala/com/microsoft/ml/spark/cognitive/OCRSchemas.scala
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ case class ReadAnalyzeResult(version: String,
readResults: Seq[ReadResult])

// Schema for one page of output from the Cognitive Services Read API.
// `language` is Option because the service may omit the field; the geometric
// fields (angle/width/height and all bounding boxes) are Double because the
// service returns fractional values — the earlier Int/String schema broke
// response deserialization (this commit's "Fix read schemas").
// NOTE(review): the scraped diff contained both the old and new field lines
// interleaved; this is the reconstructed post-change definition.
case class ReadResult(page: Int,
                      language: Option[String],
                      angle: Double,
                      width: Double,
                      height: Double,
                      unit: String,
                      lines: Array[ReadLine])

// One recognized line of text; boundingBox holds fractional coordinates.
case class ReadLine(boundingBox: Array[Double], text: String, words: Array[ReadWord])

// One recognized word with its bounding box and recognition confidence in [0, 1]
// (range presumed from the service docs — confirm).
case class ReadWord(boundingBox: Array[Double], text: String, confidence: Double)
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ case class TARequest(documents: Seq[TADocument])

object TARequest extends SparkBindings[TARequest]

// Per-document error returned by the Text Analytics service. The field was
// renamed from `message` to `error` in this commit to match the service's
// actual JSON response key, so the error text deserializes correctly.
// NOTE(review): the scraped diff showed both the old and new definitions;
// this is the post-change one.
case class TAError(id: String, error: String)

object TAError extends SparkBindings[TAError]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@ import com.microsoft.ml.spark.core.test.base.{Flaky, TestBase}
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.NamespaceInjections.pipelineModel
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.typedLit
import org.apache.spark.sql.functions.{corr, typedLit}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.scalactic.Equality
import org.scalatest.Assertion
import com.microsoft.ml.spark.FluentAPI._
import com.microsoft.ml.spark.featurize.text.PageSplitter

// Mixin supplying the Cognitive Services API key for tests: the
// COGNITIVE_API_KEY environment variable wins, otherwise fall back to the
// shared secret store.
trait CognitiveKey {
  lazy val cognitiveKey = sys.env.get("COGNITIVE_API_KEY").getOrElse(Secrets.CognitiveApiKey)
}

trait OCRUtils extends TestBase {

import session.implicits._

lazy val df: DataFrame = Seq(
Expand All @@ -28,6 +30,10 @@ trait OCRUtils extends TestBase {
"https://mmlspark.blob.core.windows.net/datasets/OCR/test3.png"
).toDF("url")

// Single-row DataFrame with one PDF URL, used to exercise the Read API's
// multi-page document support.
lazy val pdfDf: DataFrame = Seq(
  "https://mmlspark.blob.core.windows.net/datasets/OCR/paper.pdf"
).toDF("url")

lazy val bytesDF: DataFrame = BingImageSearch
.downloadFromUrls("url", "imageBytes", 4, 10000)
.transform(df)
Expand Down Expand Up @@ -84,6 +90,7 @@ class OCRSuite extends TransformerFuzzing[OCR] with CognitiveKey with Flaky with

override def reader: MLReadable[_] = OCR
}

class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] with CognitiveKey with Flaky {

import session.implicits._
Expand Down Expand Up @@ -154,7 +161,7 @@ class AnalyzeImageSuite extends TransformerFuzzing[AnalyzeImage] with CognitiveK
val fromRow = AIResponse.makeFromRowConverter
val responses = ai.transform(df).select("features")
.collect().toList.map(r =>
fromRow(r.getStruct(0)))
fromRow(r.getStruct(0)))
assert(responses.head.categories.get.head.name === "others_")
assert(responses(1).categories.get.head.name === "text_sign")
}
Expand Down Expand Up @@ -257,6 +264,7 @@ class ReadSuite extends TransformerFuzzing[Read]
def prep(df: DataFrame) = {
df.select("url", "ocr.analyzeResult.readResults")
}

super.assertDFEq(prep(df1), prep(df2))(eq)
}

Expand All @@ -269,6 +277,17 @@ class ReadSuite extends TransformerFuzzing[Read]
headStr === "CLOSED WHEN ONE DOOR CLOSES, ANOTHER OPENS. ALL YOU HAVE TO DO IS WALK IN")
}

// Runs the Read transformer over a PDF URL, flattens the OCR result into a
// single "ocr" string column, and checks the first row's extracted text.
test("Basic Usage with pdf") {
  val results = pdfDf.mlTransform(read, Read.flatten("ocr", "ocr"))
    .select("ocr")
    .collect()
  val headStr = results.head.getString(0)
  // Expected opening text of the test paper's first page; note the non-ASCII
  // multiplication signs (×) must survive response decoding.
  val correctPrefix = "Full Tree Conditioned Tree Component Space " +
    "Efficiency Measured Data O(n × d) 380 MB Tree O((2n/l) × d)"

  assert(headStr.startsWith(correctPrefix))
}

test("Basic Usage with Bytes") {
val results = bytesDF.mlTransform(bytesRead, Read.flatten("ocr", "ocr"))
.select("ocr")
Expand Down

0 comments on commit 0717ac4

Please sign in to comment.