Skip to content

Commit

Permalink
Release cut 1 1 (#92)
Browse files Browse the repository at this point in the history
* Release Cut 1.1.0

- Remove Native unused stuff with dependencies
- Add nightly DJL Sentence Piece to include Windows Support
- Clean up code with TODOs
  • Loading branch information
Lundez authored Jan 3, 2022
1 parent f7d6714 commit e988168
Show file tree
Hide file tree
Showing 15 changed files with 41 additions and 235 deletions.
10 changes: 0 additions & 10 deletions .github/workflows/create_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,3 @@ jobs:
- Second Change
draft: false
prerelease: false
#- name: Upload Release Asset
# id: upload-release-asset
# uses: actions/upload-release-asset@v1.0.1
# env:
# GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# with:
# upload_url: ${{ steps.create_release.outputs.upload_url }} # This pulls from the CREATE RELEASE step above, referencing its ID to get its outputs object, which includes an `upload_url`. See this blog post for more info: https://jasonet.co/posts/new-features-of-github-actions/#passing-data-to-future-steps
# asset_path: ./my-artifact.zip
# asset_name: my-artifact.zip
# asset_content_type: application/zip
8 changes: 3 additions & 5 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,12 @@ plugins {
}

group = "com.londogard"
version = "1.1.0-BETA"
version = "1.1.0"

repositories {
mavenCentral()
jcenter()
maven("https://oss.sonatype.org/content/repositories/snapshots")
maven("https://jitpack.io")
}

Expand All @@ -34,10 +35,7 @@ dependencies {
implementation("org.jetbrains.kotlinx:multik-jvm:0.1.1")

// DJL
implementation("ai.djl:api:0.14.0")
implementation("ai.djl.pytorch:pytorch-engine:0.14.0")
implementation("ai.djl.pytorch:pytorch-native-auto:1.9.1")
implementation("ai.djl.sentencepiece:sentencepiece:0.14.0")
implementation("ai.djl.sentencepiece:sentencepiece:0.15.0-SNAPSHOT")

implementation("com.github.ben-manes.caffeine:caffeine:3.0.5")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import org.jetbrains.kotlinx.multik.ndarray.data.MultiArray
import org.jetbrains.kotlinx.multik.ndarray.operations.*
import kotlin.math.ln

// TODO add L2/L1 regularization
// TODO add potential Intercept
class LogisticLoss: Loss {
//J = 1/m*sum(dot(-y,log(sigmoid(X*theta)))-dot(1-y,log(1-sigmoid(X*theta))));
override fun loss(weights: D2Array<Float>, X: MultiArray<Float, D2>, y: D2Array<Float>): Float {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,50 +2,35 @@ package com.londogard.nlp.meachinelearning.metrics

import com.londogard.nlp.meachinelearning.inplaceOp
import org.jetbrains.kotlinx.multik.ndarray.data.D2Array
import org.jetbrains.kotlinx.multik.ndarray.data.view
import org.jetbrains.kotlinx.multik.ndarray.data.compareTo
import org.jetbrains.kotlinx.multik.ndarray.data.get
import org.jetbrains.kotlinx.multik.ndarray.operations.average
import org.jetbrains.kotlinx.multik.ndarray.operations.minus
import org.jetbrains.kotlinx.multik.ndarray.operations.sum
import kotlin.math.pow

object Metrics {
fun confusionMatrix() {
TODO("")
}

fun f1() {
TODO("")
}

// precision = TP / (TP + FP)
// Precision = (Sum over c in C of TruePositives_c) / (Sum over c in C of (TruePositives_c + FalsePositives_c))
fun <T: Number> precision(test: D2Array<T>, predicted: D2Array<T>) {
TODO("")
}

// recall = TP / (TP + FN)
fun <T: Number> recall(test: D2Array<T>, predicted: D2Array<T>) {
TODO("")
}

fun rSquared(test: D2Array<Float>, predicted: D2Array<Float>): Float {
requireSameShape(test, predicted)
val sst = (test - test.average().toFloat()).inplaceOp { it.pow(2) }.sum()
val ssr = (test - predicted).inplaceOp { it.pow(2) }.sum()

return (sst - ssr) / sst //
}


// accuracy = (TP + TN) / (TP + FP + FN + TN)
fun <T : Number> accuracy(test: D2Array<T>, predicted: D2Array<T>): Double {
require(test.shape.toList() == predicted.shape.toList()) {
"The size of testWords list doesn't match the size of the testTags list"
}
var correct = 0
for (row in 0 until test.shape[0]) {
if(test.view(row) == predicted.view(row)) { correct += 1 }
requireSameShape(test, predicted)
val correctPredictions = (0 until test.shape[0]).fold(0) {
acc, row ->
if (test[row] == predicted[row]) acc + 1 else acc
}

return correct.toDouble() / test.shape[0]
return correctPredictions / test.shape[0].toDouble()
}

private fun <T : Number> requireSameShape(test: D2Array<T>, predicted: D2Array<T>) =
require(test.shape.contentEquals(predicted.shape)) {
"The size of testWords list doesn't match the size of the testTags list"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ import org.jetbrains.kotlinx.multik.ndarray.operations.minus
import org.jetbrains.kotlinx.multik.ndarray.operations.times
import kotlin.math.abs

// TODO optimizer can figure out the yPredicted to save one iteration of predicts!
class GradientDescent(
val maxIterations: Int,
val stepSize: Float,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import org.jetbrains.kotlinx.multik.ndarray.data.*
import org.jetbrains.kotlinx.multik.ndarray.operations.*
import kotlin.math.ln

// TODO add multi-class Naive Bayes
// https://en.wikipedia.org/wiki/Naive_Bayes_classifier
// This variant uses Laplacian Smoothing
class NaiveBayes: Classifier {
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

22 changes: 14 additions & 8 deletions src/main/kotlin/com/londogard/nlp/preprocessing/Preprocessor.kt
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,20 @@ import com.londogard.nlp.tokenizer.TokenizerSpecialTokens.NumberStr

object Preprocessor {
fun replaceNumber(text: String): String = NumberPattern.replace(text, NumberStr)
fun replaceAllCaps(text: String): String = TODO("")

// Twitter-related functions -- might be removed (!)
fun String.removeTickers() = replace(Regex("\\\$\\w*"), "")
fun String.removeRTs() = replace(Regex("^RT[\\s]+"), "")
fun String.removeURLs() = replace(Regex("https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]"), "")
fun String.removeHashtags() = replace("#", "")
fun String.removeMentions() = replace(Regex("[@#][\\w_-]+"), "")
fun String.removeXMLEncodings() = replace(Regex("&[a-z]*;")," ")
fun String.removeExtraSpaces() = replace(Regex("\\s+")," ")
fun String.removeTickers(): String = TickerPattern.replace(this, "")
fun String.removeRTs(): String = RetweetPattern.replace(this, "")
fun String.removeURLs(): String = URLPattern.replace(this, "")
fun String.removeHashtags(): String = replace("#", "")
fun String.removeMentions(): String = MentionPattern.replace(this, "")
fun String.removeXMLEncodings(): String = XMLPattern.replace(this, " ")
fun String.removeExtraSpaces(): String = WhiteSpacePattern.replace(this, " ")

private val TickerPattern by lazy { Regex("\\\$\\w*") }
private val RetweetPattern by lazy { Regex("^RT[\\s]+") }
private val MentionPattern by lazy { Regex("[@#][\\w_-]+") }
private val URLPattern by lazy { Regex("https?://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]") }
private val XMLPattern by lazy { Regex("&[a-z]*;") }
private val WhiteSpacePattern by lazy { Regex("\\s+") }
}
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
package com.londogard.nlp.tokenizer.sentence

import com.londogard.nlp.preprocessing.Preprocessor.removeExtraSpaces
import com.londogard.nlp.tokenizer.Tokenizer

class SimpleSentenceTokenizer: Tokenizer {
override fun split(text: String): List<String> {
return text
.split(sentence)
.map { it.replace(multiSpaces, " ") }
.split(sentenceRegex)
.map { sentence -> sentence.removeExtraSpaces() }
}

companion object {
val multiSpaces: Regex = Regex("\\s+")
val sentence: Regex = Regex("([.!?]\\s|[\n\r]+)")
val sentenceRegex: Regex = Regex("([.!?]\\s|[\n\r]+)")
}
}
6 changes: 0 additions & 6 deletions src/main/kotlin/com/londogard/nlp/utils/EjmlExtensions.kt
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,6 @@ operator fun SimpleMatrix.plusAssign(other: SimpleMatrix) = when (type to other.
else -> throw IllegalArgumentException("Cannot plusAssign other types")
}


// def similarities_vectorized2(vector_data):
// norms = np.linalg.norm(vector_data, axis=1)
// combs = np.fromiter(combinations(range(vector_data.shape[0]),2), dtype='i,i')
// similarities = (vector_data[combs['f0']]*vector_data[combs['f1']]).sum(axis=1)/norms[combs['f0']]/norms[combs['f1']]
// return combs, similarities
// x·y / (||x|| × ||y||) = (x / ||x||) · (y / ||y||)
fun FMatrixRMaj.cosineDistanceOneToMany(other: FMatrixRMaj): FMatrixRMaj {
val norms = this.normF()
Expand Down
Loading

0 comments on commit e988168

Please sign in to comment.