refactored elevenlabs voice settings and added tests

NatashaTheRobot · NatashaTheRobot · commit fb88e3373335 · 2024-06-01T09:31:16.000+05:30
diff --git a/Package.swift b/Package.swift
@@ -231,6 +231,17 @@ let package = Package(
             swiftSettings: [
                 .enableExperimentalFeature("AccessLevelOnImport")
             ]
+        ),
+        .testTarget(
+            name: "ElevenLabsTests",
+            dependencies: [
+                "AI",
+                "Swallow"
+            ],
+            path: "Tests/ElevenLabs",
+            swiftSettings: [
+                .enableExperimentalFeature("AccessLevelOnImport")
+            ]
         )
     ]
 )
diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.APISpecification.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.APISpecification.swift
@@ -23,12 +23,12 @@ extension ElevenLabs.APISpecification {
             }
             
             let text: String
-            let voiceSettings: [String: JSON]
+            let voiceSettings: ElevenLabs.VoiceSettings
             let model: ElevenLabs.Model
             
             init(
                 text: String,
-                voiceSettings: [String: JSON],
+                voiceSettings: ElevenLabs.VoiceSettings,
                 model: ElevenLabs.Model
             ) {
                 self.text = text
diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.Client.swift
@@ -75,7 +75,7 @@ extension ElevenLabs.Client {
     public func speech(
         for text: String,
         voiceID: String,
-        voiceSettings: [String: JSON]? = nil,
+        voiceSettings: ElevenLabs.VoiceSettings,
         model: ElevenLabs.Model
     ) async throws -> Data {
         let request = try HTTPRequest(url: URL(string: "\(apiSpecification.host)/v1/text-to-speech/\(voiceID)")!)
@@ -86,10 +86,7 @@ extension ElevenLabs.Client {
             .jsonBody(
                 ElevenLabs.APISpecification.RequestBodies.SpeechRequest(
                     text: text,
-                    voiceSettings: voiceSettings ?? [
-                        "stability" : 0,
-                        "similarity_boost": 0
-                    ],
+                    voiceSettings: voiceSettings,
                     model: model
                 ),
                 keyEncodingStrategy: .convertToSnakeCase
diff --git a/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift b/Sources/ElevenLabs/Intramodular/ElevenLabs.VoiceSettings.swift
@@ -0,0 +1,97 @@
+//
+// Copyright (c) Vatsal Manot
+//
+
+import Foundation
+
+extension ElevenLabs {
+    public final class VoiceSettings: Codable, Sendable {
+        
+        public enum Setting: String, Codable, Sendable {
+            case stability
+            case similarityBoost = "similarity_boost"
+            case styleExaggeration = "style"
+            case speakerBoost = "use_speaker_boost"
+        }
+        
+        /// Increasing stability will make the voice more consistent between re-generations, but it can also make it sound a bit monotone. On longer text fragments it is recommended to lower this value.
+        /// This is a double between 0 (more variable) and 1 (more stable).
+        public let stability: Double
+        
+        /// Increasing the Similarity Boost setting enhances the overall voice clarity and targets speaker similarity. However, very high values can cause artifacts, so it is recommended to adjust this setting to find the optimal value.
+        /// This is a double between 0 (Low) and 1 (High).
+        public let similarityBoost: Double
+        
+        /// High values are recommended if the style of the speech should be exaggerated compared to the selected voice. Higher values can lead to more instability in the generated speech. Setting this to 0 will greatly increase generation speed and is the default setting.
+        public let styleExaggeration: Double
+        
+        /// Boost the similarity of the synthesized speech and the voice at the cost of some generation speed.
+        public let speakerBoost: Bool
+        
+        public init(stability: Double,
+                    similarityBoost: Double,
+                    styleExaggeration: Double,
+                    speakerBoost: Bool) {
+            self.stability = max(0, min(1, stability))
+            self.similarityBoost = max(0, min(1, similarityBoost))
+            self.styleExaggeration = max(0, min(1, styleExaggeration))
+            self.speakerBoost = speakerBoost
+        }
+        
+        public init(stability: Double? = nil,
+                    similarityBoost: Double? = nil,
+                    styleExaggeration: Double? = nil,
+                    speakerBoost: Bool? = nil) {
+            self.stability = stability.map { max(0, min(1, $0)) } ?? 0.5
+            self.similarityBoost = similarityBoost.map { max(0, min(1, $0)) } ?? 0.75
+            self.styleExaggeration = styleExaggeration.map { max(0, min(1, $0)) } ?? 0
+            self.speakerBoost = speakerBoost ?? true
+        }
+        
+        public convenience init(stability: Double) {
+            self.init(
+                stability: stability,
+                similarityBoost: 0.75,
+                styleExaggeration: 0,
+                speakerBoost: true
+            )
+        }
+        
+        public convenience init(similarityBoost: Double) {
+            self.init(
+                stability: 0.5,
+                similarityBoost: similarityBoost,
+                styleExaggeration: 0,
+                speakerBoost: true
+            )
+        }
+        
+        public convenience init(styleExaggeration: Double) {
+            self.init(
+                stability: 0.5,
+                similarityBoost: 0.75,
+                styleExaggeration: styleExaggeration,
+                speakerBoost: true
+            )
+        }
+        
+        public convenience init(speakerBoost: Bool) {
+            self.init(
+                stability: 0.5,
+                similarityBoost: 0.75,
+                styleExaggeration: 0,
+                speakerBoost: speakerBoost
+            )
+        }
+        
+        public func encode(to encoder: Encoder) throws {
+            var container = encoder.container(keyedBy: CodingKeys.self)
+            
+            try container.encode(stability, forKey: .stability)
+            try container.encode(similarityBoost, forKey: .similarityBoost)
+            try container.encode(styleExaggeration, forKey: .styleExaggeration)
+            try container.encode(speakerBoost, forKey: .speakerBoost)
+        }
+    }
+}
+
diff --git a/Tests/ElevenLabs/Instramodular/SpeechTests.swift b/Tests/ElevenLabs/Instramodular/SpeechTests.swift
@@ -0,0 +1,35 @@
+//
+// Copyright (c) Vatsal Manot
+//
+
+import ElevenLabs
+import XCTest
+
+final class SpeechTests: XCTestCase {
+        
+    func testCreateSpeech() async throws {
+        
+        let text = "In a quiet, unassuming village nestled deep in a lush, verdant valley, young Elara leads a simple life, dreaming of adventure beyond the horizon. Her village is filled with ancient folklore and tales of mystical relics, but none capture her imagination like the legend of the Enchanted Amulet—a powerful artifact said to grant its bearer the ability to control time."
+        
+        let voiceID = "4v7HtLWqY9rpQ7Cg2GT4"
+        
+        let voiceSettings: ElevenLabs.VoiceSettings = .init(
+            stability: 0.5,
+            similarityBoost: 0.75,
+            styleExaggeration: 0,
+            speakerBoost: true)
+        
+        let model = ElevenLabs.Model.EnglishV1
+        
+        let speechData = try await client.speech(
+            for: text,
+            voiceID: voiceID,
+            voiceSettings: voiceSettings,
+            model: model
+        )
+        
+        XCTAssertFalse(speechData.isEmpty, "speechData should not be empty")
+        
+        _ = speechData
+    }
+}
diff --git a/Tests/ElevenLabs/module.swift b/Tests/ElevenLabs/module.swift
@@ -0,0 +1,15 @@
+//
+// Copyright (c) Vatsal Manot
+//
+
+import ElevenLabs
+
+public var ELEVENLABS_API_KEY: String {
+    ""
+}
+
+public var client: ElevenLabs.Client {
+    let client = ElevenLabs.Client(apiKey: ELEVENLABS_API_KEY)
+        
+    return client
+}

Original file line number	Diff line number	Diff line change
`@@ -231,6 +231,17 @@ let package = Package(`
`231`	`231`	`swiftSettings: [`
`232`	`232`	`.enableExperimentalFeature("AccessLevelOnImport")`
`233`	`233`	`]`
	`234`	`+ ),`
	`235`	`+ .testTarget(`
	`236`	`+ name: "ElevenLabsTests",`
	`237`	`+ dependencies: [`
	`238`	`+ "AI",`
	`239`	`+ "Swallow"`
	`240`	`+ ],`
	`241`	`+ path: "Tests/ElevenLabs",`
	`242`	`+ swiftSettings: [`
	`243`	`+ .enableExperimentalFeature("AccessLevelOnImport")`
	`244`	`+ ]`
`234`	`245`	`)`
`235`	`246`	`]`
`236`	`247`	`)`