Release cut 1 1 (#92)

* Release Cut 1.1.0 - Remove Native unused stuff with dependencies - Add nightly DJL Sentence Piece to include Windows Support - Clean up code with TODOs
londogard · Jan 3, 2022 · e988168 · e988168
1 parent f7d6714
commit e988168
Show file tree

Hide file tree

Showing 15 changed files with 41 additions and 235 deletions.
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
@@ -26,13 +26,3 @@ jobs:
             - Second Change
           draft: false
           prerelease: false
-      #- name: Upload Release Asset
-      #  id: upload-release-asset 
-      #  uses: actions/upload-release-asset@v1.0.1
-      #  env:
-      #    GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      #  with:
-      #    upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing it's ID to get its outputs object, which include a `upload_url`. See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps 
-      #    asset_path: ./my-artifact.zip
-      #    asset_name: my-artifact.zip
-      #    asset_content_type: application/zip
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -9,11 +9,12 @@ plugins {
 }
 
 group = "com.londogard"
-version = "1.1.0-BETA"
+version = "1.1.0"
 
 repositories {
     mavenCentral()
     jcenter()
+    maven("https://oss.sonatype.org/content/repositories/snapshots")
     maven("https://jitpack.io")
 }
 
@@ -34,10 +35,7 @@ dependencies {
     implementation("org.jetbrains.kotlinx:multik-jvm:0.1.1")
 
     // DJL
-    implementation("ai.djl:api:0.14.0")
-    implementation("ai.djl.pytorch:pytorch-engine:0.14.0")
-    implementation("ai.djl.pytorch:pytorch-native-auto:1.9.1")
-    implementation("ai.djl.sentencepiece:sentencepiece:0.14.0")
+    implementation("ai.djl.sentencepiece:sentencepiece:0.15.0-SNAPSHOT")
 
     implementation("com.github.ben-manes.caffeine:caffeine:3.0.5")
 

diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/loss/LogisticLoss.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/loss/LogisticLoss.kt
@@ -9,8 +9,6 @@ import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray
 import org.jetbrains.kotlinx.multik.ndarray.operations.*
 import kotlin.math.ln
 
-// TODO add L2/L1 regularization
-// TODO add potential Intercept
 class LogisticLoss: Loss {
     //J = 1/m*sum(dot(-y,log(sigmoid(X*theta)))-dot(1-y,log(1-sigmoid(X*theta))));
     override fun loss(weights: D2Array<Float>, X: MultiArray<Float, D2>, y: D2Array<Float>): Float {

diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/metrics/Metrics.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/metrics/Metrics.kt
@@ -2,50 +2,35 @@ package com.londogard.nlp.meachinelearning.metrics
 
 import com.londogard.nlp.meachinelearning.inplaceOp
 import org.jetbrains.kotlinx.multik.ndarray.data.D2Array
-import org.jetbrains.kotlinx.multik.ndarray.data.view
+import org.jetbrains.kotlinx.multik.ndarray.data.compareTo
+import org.jetbrains.kotlinx.multik.ndarray.data.get
 import org.jetbrains.kotlinx.multik.ndarray.operations.average
 import org.jetbrains.kotlinx.multik.ndarray.operations.minus
 import org.jetbrains.kotlinx.multik.ndarray.operations.sum
 import kotlin.math.pow
 
 object Metrics {
-    fun confusionMatrix() {
-        TODO("")
-    }
-
-    fun f1() {
-        TODO("")
-    }
-
-    // precision = TP / (TP + FP)
-    // Precision = Sum c in C TruePositives_c / Sum c in C (TruePositives_c + FalsePositives_c)
-    fun <T: Number> precision(test: D2Array<T>, predicted: D2Array<T>) {
-        TODO("")
-    }
-
-    // recall = TP / (TP + FN)
-    fun <T: Number> recall(test: D2Array<T>, predicted: D2Array<T>) {
-        TODO("")
-    }
-
     fun rSquared(test: D2Array<Float>, predicted: D2Array<Float>): Float {
+        requireSameShape(test, predicted)
         val sst = (test - test.average().toFloat()).inplaceOp { it.pow(2) }.sum()
         val ssr = (test - predicted).inplaceOp { it.pow(2) }.sum()
 
         return (sst - ssr) / sst // R²
     }
 
-
     // accuracy = (TP + TN) / (TP + FP + FN + TN)
     fun <T : Number> accuracy(test: D2Array<T>, predicted: D2Array<T>): Double {
-        require(test.shape.toList() == predicted.shape.toList()) {
-            "The size of testWords list doesn't match the size of the testTags list"
-        }
-        var correct = 0
-        for (row in 0 until test.shape[0]) {
-            if(test.view(row) == predicted.view(row)) { correct += 1 }
+        requireSameShape(test, predicted)
+        val correctPredictions = (0 until test.shape[0]).fold(0) {
+                acc, row ->
+            if (test[row] == predicted[row]) acc + 1 else acc
         }
 
-        return correct.toDouble() / test.shape[0]
+        return  correctPredictions / test.shape[0].toDouble()
     }
+
+    private fun <T : Number> requireSameShape(test: D2Array<T>, predicted: D2Array<T>) =
+        require(test.shape.contentEquals(predicted.shape)) {
+            "The size of testWords list doesn't match the size of the testTags list"
+        }
 }
diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/optimizer/GradientDescent.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/optimizer/GradientDescent.kt
@@ -8,7 +8,6 @@ import org.jetbrains.kotlinx.multik.ndarray.operations.minus
 import org.jetbrains.kotlinx.multik.ndarray.operations.times
 import kotlin.math.abs
 
-// TODO optimizer can figure out the yPredicted to save one iteration of predicts!
 class GradientDescent(
     val maxIterations: Int,
     val stepSize: Float,

diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/predictors/classifiers/NaiveBayes.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/predictors/classifiers/NaiveBayes.kt
@@ -10,7 +10,6 @@ import org.jetbrains.kotlinx.multik.ndarray.data.*
 import org.jetbrains.kotlinx.multik.ndarray.operations.*
 import kotlin.math.ln
 
-// TODO add multi-class Naive Bayes
 // https://en.wikipedia.org/wiki/Naive_Bayes_classifier
 // This variant uses Laplacian Smoothing
 class NaiveBayes: Classifier {

diff --git a/...n/kotlin/com/londogard/nlp/meachinelearning/transformers/native/NativeTfIdfTransformer.kt b/...n/kotlin/com/londogard/nlp/meachinelearning/transformers/native/NativeTfIdfTransformer.kt
diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/transformers/native/NativeTransformer.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/transformers/native/NativeTransformer.kt
diff --git a/...main/kotlin/com/londogard/nlp/meachinelearning/vectorizer/native/NativeCountVectorizer.kt b/...main/kotlin/com/londogard/nlp/meachinelearning/vectorizer/native/NativeCountVectorizer.kt
diff --git a/src/main/kotlin/com/londogard/nlp/meachinelearning/vectorizer/native/NativeVectorizer.kt b/src/main/kotlin/com/londogard/nlp/meachinelearning/vectorizer/native/NativeVectorizer.kt
diff --git a/src/main/kotlin/com/londogard/nlp/preprocessing/Preprocessor.kt b/src/main/kotlin/com/londogard/nlp/preprocessing/Preprocessor.kt
@@ -5,14 +5,20 @@ import com.londogard.nlp.tokenizer.TokenizerSpecialTokens.NumberStr
 
 object Preprocessor {
     fun replaceNumber(text: String): String = NumberPattern.replace(text, NumberStr)
-    fun replaceAllCaps(text: String): String = TODO("")
 
     // Twitter related functions -- Might be removed (!)
-    fun String.removeTickers() = replace(Regex("\\\$\\w*"), "")
-    fun String.removeRTs() = replace(Regex("^RT[\\s]+"), "")
-    fun String.removeURLs() = replace(Regex("https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]"), "")
-    fun String.removeHashtags() = replace("#", "")
-    fun String.removeMentions() = replace(Regex("[@#][\\w_-]+"), "")
-    fun String.removeXMLEncodings() = replace(Regex("&[a-z]*;")," ")
-    fun String.removeExtraSpaces() = replace(Regex("\\s+")," ")
+    fun String.removeTickers(): String = TickerPattern.replace(this, "")
+    fun String.removeRTs(): String = RetweetPattern.replace(this, "")
+    fun String.removeURLs(): String = URLPattern.replace(this, "")
+    fun String.removeHashtags(): String = replace("#", "")
+    fun String.removeMentions(): String = MentionPattern.replace(this, "")
+    fun String.removeXMLEncodings(): String = XMLPattern.replace(this, " ")
+    fun String.removeExtraSpaces(): String = WhiteSpacePattern.replace(this, " ")
+
+    private val TickerPattern by lazy { Regex("\\\$\\w*") }
+    private val RetweetPattern by lazy { Regex("^RT[\\s]+") }
+    private val MentionPattern by lazy { Regex("[@#][\\w_-]+") }
+    private val URLPattern by lazy { Regex("https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]") }
+    private val XMLPattern by lazy { Regex("&[a-z]*;") }
+    private val WhiteSpacePattern by lazy { Regex("\\s+") }
 }
diff --git a/src/main/kotlin/com/londogard/nlp/tokenizer/sentence/SimpleSentenceTokenizer.kt b/src/main/kotlin/com/londogard/nlp/tokenizer/sentence/SimpleSentenceTokenizer.kt
@@ -1,16 +1,16 @@
 package com.londogard.nlp.tokenizer.sentence
 
+import com.londogard.nlp.preprocessing.Preprocessor.removeExtraSpaces
 import com.londogard.nlp.tokenizer.Tokenizer
 
 class SimpleSentenceTokenizer: Tokenizer {
     override fun split(text: String): List<String> {
         return text
-            .split(sentence)
-            .map { it.replace(multiSpaces, " ") }
+            .split(sentenceRegex)
+            .map { sentence -> sentence.removeExtraSpaces() }
     }
 
     companion object {
-        val multiSpaces: Regex = Regex("\\s+")
-        val sentence: Regex = Regex("([.!?]\\s|[\n\r]+)")
+        val sentenceRegex: Regex = Regex("([.!?]\\s|[\n\r]+)")
     }
 }
diff --git a/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt b/src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt
@@ -224,12 +224,6 @@ operator fun SimpleMatrix.plusAssign(other: SimpleMatrix) = when (type to other.
     else -> throw IllegalArgumentException("Cannot plusAssign other types")
 }
 
-
-// def similarities_vectorized2(vector_data):
-//    norms = np.linalg.norm(vector_data, axis=1)
-//    combs = np.fromiter(combinations(range(vector_data.shape[0]),2), dtype='i,i')
-//    similarities = (vector_data[combs['f0']]*vector_data[combs['f1']]).sum(axis=1)/norms[combs['f0']]/norms[combs['f1']]
-//    return combs, similarities
 // x·y / (||x|| × ||y||) = (x / ||x||) · (y / ||y||)
 fun FMatrixRMaj.cosineDistanceOneToMany(other: FMatrixRMaj): FMatrixRMaj {
     val norms = this.normF()