|
| 1 | +// |
| 2 | +// Copyright (c) Vatsal Manot |
| 3 | +// |
| 4 | + |
| 5 | +import Foundation |
| 6 | + |
extension ElevenLabs {
    /// Voice-generation parameters sent along with a synthesis request.
    ///
    /// All `Double` values passed to the initializers are clamped to `0...1`.
    /// Omitted values fall back to: stability `0.5`, similarity boost `0.75`,
    /// style exaggeration `0`, speaker boost `true`.
    public final class VoiceSettings: Codable, Sendable {
        
        /// Identifies a single voice setting; the raw value is the setting's serialized name.
        public enum Setting: String, Codable, Sendable {
            case stability
            case similarityBoost = "similarity_boost"
            case styleExaggeration = "style"
            case speakerBoost = "use_speaker_boost"
        }
        
        /// Keys used when encoding/decoding.
        ///
        /// Declared explicitly because the compiler-synthesized keys would mirror the
        /// Swift property names (`similarityBoost`, …) instead of the serialized names
        /// listed in `Setting` (`similarity_boost`, `style`, `use_speaker_boost`).
        private enum CodingKeys: String, CodingKey {
            case stability
            case similarityBoost = "similarity_boost"
            case styleExaggeration = "style"
            case speakerBoost = "use_speaker_boost"
        }
        
        // Single source of truth for the fallback values used by every initializer.
        private static let defaultStability: Double = 0.5
        private static let defaultSimilarityBoost: Double = 0.75
        private static let defaultStyleExaggeration: Double = 0
        private static let defaultSpeakerBoost: Bool = true
        
        /// Increasing stability will make the voice more consistent between re-generations, but it can also make it sound a bit monotone. On longer text fragments it is recommended to lower this value.
        /// This is a double between 0 (more variable) and 1 (more stable).
        public let stability: Double
        
        /// Increasing the Similarity Boost setting enhances the overall voice clarity and targets speaker similarity. However, very high values can cause artifacts, so it is recommended to adjust this setting to find the optimal value.
        /// This is a double between 0 (Low) and 1 (High).
        public let similarityBoost: Double
        
        /// High values are recommended if the style of the speech should be exaggerated compared to the selected voice. Higher values can lead to more instability in the generated speech. Setting this to 0 will greatly increase generation speed and is the default setting.
        public let styleExaggeration: Double
        
        /// Boost the similarity of the synthesized speech and the voice at the cost of some generation speed.
        public let speakerBoost: Bool
        
        /// Restricts `value` to the closed range `0...1`.
        private static func clamp(_ value: Double) -> Double {
            max(0, min(1, value))
        }
        
        /// Creates settings from explicit values.
        ///
        /// - Parameters:
        ///   - stability: Clamped to `0...1`.
        ///   - similarityBoost: Clamped to `0...1`.
        ///   - styleExaggeration: Clamped to `0...1`.
        ///   - speakerBoost: Stored as given.
        public init(stability: Double,
                    similarityBoost: Double,
                    styleExaggeration: Double,
                    speakerBoost: Bool) {
            self.stability = VoiceSettings.clamp(stability)
            self.similarityBoost = VoiceSettings.clamp(similarityBoost)
            self.styleExaggeration = VoiceSettings.clamp(styleExaggeration)
            self.speakerBoost = speakerBoost
        }
        
        /// Creates settings where every `nil` argument is replaced by its default
        /// (`0.5`, `0.75`, `0`, `true` respectively); non-`nil` doubles are clamped to `0...1`.
        public init(stability: Double? = nil,
                    similarityBoost: Double? = nil,
                    styleExaggeration: Double? = nil,
                    speakerBoost: Bool? = nil) {
            self.stability = stability.map { VoiceSettings.clamp($0) } ?? VoiceSettings.defaultStability
            self.similarityBoost = similarityBoost.map { VoiceSettings.clamp($0) } ?? VoiceSettings.defaultSimilarityBoost
            self.styleExaggeration = styleExaggeration.map { VoiceSettings.clamp($0) } ?? VoiceSettings.defaultStyleExaggeration
            self.speakerBoost = speakerBoost ?? VoiceSettings.defaultSpeakerBoost
        }
        
        /// Creates settings with the given stability and defaults for everything else.
        public convenience init(stability: Double) {
            self.init(
                stability: stability,
                similarityBoost: VoiceSettings.defaultSimilarityBoost,
                styleExaggeration: VoiceSettings.defaultStyleExaggeration,
                speakerBoost: VoiceSettings.defaultSpeakerBoost
            )
        }
        
        /// Creates settings with the given similarity boost and defaults for everything else.
        public convenience init(similarityBoost: Double) {
            self.init(
                stability: VoiceSettings.defaultStability,
                similarityBoost: similarityBoost,
                styleExaggeration: VoiceSettings.defaultStyleExaggeration,
                speakerBoost: VoiceSettings.defaultSpeakerBoost
            )
        }
        
        /// Creates settings with the given style exaggeration and defaults for everything else.
        public convenience init(styleExaggeration: Double) {
            self.init(
                stability: VoiceSettings.defaultStability,
                similarityBoost: VoiceSettings.defaultSimilarityBoost,
                styleExaggeration: styleExaggeration,
                speakerBoost: VoiceSettings.defaultSpeakerBoost
            )
        }
        
        /// Creates settings with the given speaker-boost flag and defaults for everything else.
        public convenience init(speakerBoost: Bool) {
            self.init(
                stability: VoiceSettings.defaultStability,
                similarityBoost: VoiceSettings.defaultSimilarityBoost,
                styleExaggeration: VoiceSettings.defaultStyleExaggeration,
                speakerBoost: speakerBoost
            )
        }
        
        /// Encodes all four settings into a keyed container using the serialized
        /// names declared in `CodingKeys`.
        ///
        /// - Throws: Any error raised by the encoder while writing a value.
        public func encode(to encoder: Encoder) throws {
            var container = encoder.container(keyedBy: CodingKeys.self)
            
            try container.encode(stability, forKey: .stability)
            try container.encode(similarityBoost, forKey: .similarityBoost)
            try container.encode(styleExaggeration, forKey: .styleExaggeration)
            try container.encode(speakerBoost, forKey: .speakerBoost)
        }
    }
}
| 97 | + |
0 commit comments