Commit 993f2ef

Merge pull request #3 from locaal-ai/roy.revai_provider

Rev AI provider

royshil authored Dec 19, 2024
2 parents fd27e65 + 3b1b163 commit 993f2ef

Showing 13 changed files with 951 additions and 2 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/build-project.yaml
@@ -279,6 +279,14 @@ jobs:
"pluginName=${ProductName}" >> $env:GITHUB_OUTPUT
"pluginVersion=${ProductVersion}" >> $env:GITHUB_OUTPUT
- uses: actions/cache@v4
id: conan-cache
with:
path: C:\Users\runneradmin\.conan2\
key: ${{ runner.os }}-conan-cache-${{ needs.check-event.outputs.config }}
restore-keys: |
${{ runner.os }}-conan-cache-
- name: Build Plugin 🧱
uses: ./.github/actions/build-plugin
with:
2 changes: 2 additions & 0 deletions CMakeLists.txt
@@ -64,7 +64,9 @@ target_sources(
src/language-codes/language-codes.cpp
src/cloud-providers/cloud-provider.cpp
src/cloud-providers/clova/clova-provider.cpp
src/cloud-providers/deepgram/deepgram-provider.cpp
src/cloud-providers/google/google-provider.cpp
src/cloud-providers/revai/revai-provider.cpp
src/utils/ssl-utils.cpp
src/utils/curl-helper.cpp
src/timed-metadata/timed-metadata-utils.cpp)
1 change: 0 additions & 1 deletion cmake/common/compiler_common.cmake
@@ -22,7 +22,6 @@ set(_obs_clang_c_options
# cmake-format: sortable
-fno-strict-aliasing
-Wbool-conversion
-Wcomma
-Wconstant-conversion
-Wdeprecated-declarations
-Wempty-body
2 changes: 1 addition & 1 deletion cmake/macos/xcode.cmake
@@ -122,7 +122,7 @@ set(CMAKE_XCODE_ATTRIBUTE_ENABLE_STRICT_OBJC_MSGSEND YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING YES_ERROR)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_BOOL_CONVERSION YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_COMMA NO)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_CONSTANT_CONVERSION YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_EMPTY_BODY YES)
set(CMAKE_XCODE_ATTRIBUTE_CLANG_WARN_ENUM_CONVERSION YES)
6 changes: 6 additions & 0 deletions src/cloud-providers/cloud-provider.cpp
@@ -3,6 +3,8 @@
#include "clova/clova-provider.h"
#include "google/google-provider.h"
#include "aws/aws_provider.h"
#include "revai/revai-provider.h"
#include "deepgram/deepgram-provider.h"

std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerType,
CloudProvider::TranscriptionCallback callback,
@@ -14,6 +16,10 @@ std::shared_ptr<CloudProvider> createCloudProvider(const std::string &providerTy
return std::make_unique<GoogleProvider>(callback, gf);
} else if (providerType == "aws") {
return std::make_unique<AWSProvider>(callback, gf);
} else if (providerType == "revai") {
return std::make_unique<RevAIProvider>(callback, gf);
} else if (providerType == "deepgram") {
return std::make_unique<DeepgramProvider>(callback, gf);
}

return nullptr; // Return nullptr if no matching provider is found
138 changes: 138 additions & 0 deletions src/cloud-providers/deepgram/deepgram-provider.cpp
@@ -0,0 +1,138 @@
#include "deepgram-provider.h"
#include <nlohmann/json.hpp>

#include "language-codes/language-codes.h"

using json = nlohmann::json;

namespace http = beast::http;

DeepgramProvider::DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_)
: CloudProvider(callback, gf_),
ioc(),
ssl_ctx(ssl::context::tlsv12_client),
resolver(ioc),
ws(ioc, ssl_ctx)
{
needs_results_thread = true; // We need a separate thread for reading results
}

bool DeepgramProvider::init()
{
try {
// Setup SSL context
ssl_ctx.set_verify_mode(ssl::verify_peer);
ssl_ctx.set_default_verify_paths();

// Resolve the Deepgram endpoint
auto const results = resolver.resolve("api.deepgram.com", "443");

// Connect to Deepgram
net::connect(get_lowest_layer(ws), results);

// Set SNI hostname (required for TLS)
if (!SSL_set_tlsext_host_name(ws.next_layer().native_handle(),
"api.deepgram.com")) {
throw beast::system_error(
beast::error_code(static_cast<int>(::ERR_get_error()),
net::error::get_ssl_category()),
"Failed to set SNI hostname");
}

// Perform SSL handshake
ws.next_layer().handshake(ssl::stream_base::client);

// Set up WebSocket handshake with API key
ws.set_option(
websocket::stream_base::decorator([this](websocket::request_type &req) {
req.set(http::field::sec_websocket_protocol,
"token, " + std::string(gf->cloud_provider_api_key));
}));

std::string query = std::string("/v1/listen?encoding=linear16&sample_rate=16000") +
"&language=" + language_codes_from_underscore[gf->language];
// Perform WebSocket handshake
ws.handshake("api.deepgram.com", query);

obs_log(LOG_INFO, "Connected to Deepgram WebSocket successfully");
return true;
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error initializing Deepgram connection: %s", e.what());
return false;
}
}

void DeepgramProvider::sendAudioBufferToTranscription(const std::deque<float> &audio_buffer)
{
if (audio_buffer.empty())
return;

try {
// Convert float audio to int16_t (linear16 format)
std::vector<int16_t> pcm_data;
pcm_data.reserve(audio_buffer.size());

for (float sample : audio_buffer) {
// Clamp and convert to int16
float clamped = std::max(-1.0f, std::min(1.0f, sample));
pcm_data.push_back(static_cast<int16_t>(clamped * 32767.0f));
}

// Audio must go out as a binary WebSocket frame (the Deepgram docs below require binary payloads)
ws.binary(true);
ws.write(net::buffer(pcm_data.data(), pcm_data.size() * sizeof(int16_t)));

} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error sending audio to Deepgram: %s", e.what());
running = false;
}
}

void DeepgramProvider::readResultsFromTranscription()
{
try {
// Read message into buffer
beast::flat_buffer buffer;
ws.read(buffer);

// Convert to string and parse JSON
std::string msg = beast::buffers_to_string(buffer.data());
json result = json::parse(msg);

// Check if this is a transcription result
if (result["type"] == "Results" && !result["channel"]["alternatives"].empty()) {
DetectionResultWithText detection_result;

// Fill the detection result structure
detection_result.text = result["channel"]["alternatives"][0]["transcript"];
detection_result.result = result["is_final"] ? DETECTION_RESULT_SPEECH
: DETECTION_RESULT_PARTIAL;

// If there are words with timestamps
if (!result["channel"]["alternatives"][0]["words"].empty()) {
auto &words = result["channel"]["alternatives"][0]["words"];
detection_result.start_timestamp_ms = words[0]["start"];
detection_result.end_timestamp_ms = words[words.size() - 1]["end"];
}

// Send result through callback
transcription_callback(detection_result);
}
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error reading from Deepgram: %s", e.what());
}
}

void DeepgramProvider::shutdown()
{
try {
// Send the CloseStream control message as a text frame
ws.text(true);
ws.write(net::buffer(R"({"type":"CloseStream"})"));

// Close WebSocket connection
ws.close(websocket::close_code::normal);

obs_log(LOG_INFO, "Deepgram connection closed successfully");
} catch (std::exception const &e) {
obs_log(LOG_ERROR, "Error during Deepgram shutdown: %s", e.what());
}
}
31 changes: 31 additions & 0 deletions src/cloud-providers/deepgram/deepgram-provider.h
@@ -0,0 +1,31 @@
#pragma once

#include <boost/beast/core.hpp>
#include <boost/beast/websocket.hpp>
#include <boost/asio/strand.hpp>
#include <boost/beast/core/tcp_stream.hpp>
#include <boost/beast/ssl.hpp>
#include "cloud-providers/cloud-provider.h"

namespace beast = boost::beast;
namespace websocket = beast::websocket;
namespace net = boost::asio;
namespace ssl = boost::asio::ssl;
using tcp = boost::asio::ip::tcp;

class DeepgramProvider : public CloudProvider {
public:
DeepgramProvider(TranscriptionCallback callback, cloudvocal_data *gf_);
bool init() override;

protected:
void sendAudioBufferToTranscription(const std::deque<float> &audio_buffer) override;
void readResultsFromTranscription() override;
void shutdown() override;

private:
net::io_context ioc;
ssl::context ssl_ctx;
tcp::resolver resolver;
websocket::stream<beast::ssl_stream<tcp::socket>> ws;
};
156 changes: 156 additions & 0 deletions src/cloud-providers/deepgram/live_audio_api.md
@@ -0,0 +1,156 @@
Transcribe - Live audio
Use Deepgram's speech-to-text API to transcribe live-streaming audio.

Deepgram provides its customers with real-time, streaming transcription via its streaming endpoints. These endpoints are high-performance, full-duplex services running over the WebSocket protocol.

To learn more about working with real-time streaming data and results, see Get Started with Streaming Audio.

Endpoint
Production WebSocket server for Deepgram's real-time transcription with streaming audio. TLS encryption will protect your connection and data. We support a minimum of TLS 1.2.

| Detail | Description |
| --- | --- |
| Path | wss://api.deepgram.com/v1/listen |
Accepts

| Type | Description |
| --- | --- |
| Raw Audio File Data | Unprocessed or uncompressed binary audio data (such as PCM) |
| Messages | JSON formatted operations. |
Headers

| Header | Value | Description |
| --- | --- | --- |
| Sec-WebSocket-Protocol | token, <DEEPGRAM_API_KEY> | Used to establish a WebSocket connection with a specific protocol; include your Deepgram API key for authentication. |
Body Params

| Parameter | Type | Description |
| --- | --- | --- |
| callback | string | Callback URL to provide if you would like your submitted audio to be processed asynchronously. Learn More. |
| callback_method | string | Enable a callback method. Use put or post. Learn More. |
| channels | int32 | Number of independent audio channels contained in submitted streaming audio. Only read when a value is provided for encoding. Learn More. |
| dictation | boolean | Dictation automatically formats spoken commands for punctuation into their respective punctuation marks. Learn More. |
| diarize | boolean | Indicates whether to recognize speaker changes. When set to true, each word in the transcript will be assigned a speaker number starting at 0. Learn More. |
| diarize_version | string | Indicates the version of the diarization feature to use. Only available when the diarization feature is enabled. Learn More. |
| encoding | string | Expected encoding of the submitted streaming audio. If this parameter is set, sample_rate must also be specified. Learn More. |
| endpointing | boolean | Indicates how long Deepgram will wait to detect whether a speaker has finished speaking or pauses for a significant period of time. When set to true, the streaming endpoint immediately finalizes the transcription for the processed time range and returns the transcript with a speech_final parameter set to true. Learn More. |
| extra | string | Add any extra key-value pairs to the query string to customize the response. Learn More. |
| filler_words | boolean | Indicates whether to include filler words like "uh" and "um" in transcript output. When set to true, these words will be included. Learn More. |
| interim_results | boolean | Specifies whether the streaming endpoint should provide ongoing transcription updates as more audio is received. When set to true, the endpoint sends continuous updates, meaning transcription results may evolve over time. Learn More. |
| keywords | string | Unique proper nouns or specialized terms you want the model to include in its predictions, which aren't part of the model's default vocabulary. Learn More. |
| language | string | The BCP-47 language tag that hints at the primary spoken language. Learn More. |
| model | string | The AI model used to process submitted audio. Learn More. |
| multichannel | boolean | Indicates whether to transcribe each audio channel independently. Learn More. |
| numerals | boolean | Indicates whether to convert numbers from written format (e.g., one) to numerical format (e.g., 1). Learn More. |
| profanity_filter | boolean | Indicates whether to remove profanity from the transcript. Learn More. |
| punctuate | boolean | Indicates whether to add punctuation and capitalization to the transcript. Learn More. |
| redact | string | Indicates whether to redact sensitive information, replacing redacted content with asterisks (*). Learn More. |
| replace | string | Terms or phrases to search for in the submitted audio and replace. Learn More. |
| sample_rate | int32 | Sample rate of submitted streaming audio. Required (and only read) when a value is provided for encoding. Learn More. |
| search | string | Terms or phrases to search for in the submitted audio. Learn More. |
| smart_format | boolean | Indicates whether to apply formatting to transcript output. When set to true, additional formatting will be applied to transcripts to improve readability. Learn More. |
| tag | string | Set a tag to associate with the request. Learn More. |
| utterance_end_ms | string | Indicates how long Deepgram will wait to send a {"type": "UtteranceEnd"} message after a word has been transcribed. Learn More. |
| vad_events | boolean | Indicates that speech has started; you'll begin receiving {"type": "SpeechStarted"} messages upon speech starting. Learn More. |
| version | string | Version of the model to use. Learn More. |
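
The provider added in this commit only sets encoding, sample_rate, and language on the request target, and passes the API key through the Sec-WebSocket-Protocol header via the handshake decorator shown in deepgram-provider.cpp above. Any of the parameters in this table can be appended to the same query string. Below is a minimal sketch of assembling such a target; build_listen_target and the parameter values are illustrative, not part of the commit, and no URL-escaping is performed (matching the provider's current behavior).

```cpp
#include <map>
#include <string>

// Hypothetical helper: builds the /v1/listen request target from a parameter map.
// Values are assumed to be URL-safe already; the provider does no escaping either.
std::string build_listen_target(const std::map<std::string, std::string> &params)
{
	std::string target = "/v1/listen";
	char sep = '?';
	for (const auto &kv : params) {
		target += sep + kv.first + "=" + kv.second;
		sep = '&';
	}
	return target;
}

// Example (parameter values are illustrative; std::map emits keys alphabetically):
//   build_listen_target({{"encoding", "linear16"},
//                        {"sample_rate", "16000"},
//                        {"language", "en"},
//                        {"interim_results", "true"}});
// -> "/v1/listen?encoding=linear16&interim_results=true&language=en&sample_rate=16000"
```
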
Sending Audio Data
All audio data is transmitted to the streaming endpoint as binary WebSocket messages, with payloads containing the raw audio data. The full-duplex protocol allows for real-time streaming, enabling you to receive transcription responses simultaneously as you upload data. For optimal performance, each streaming buffer should represent between 20 and 250 milliseconds of audio.
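
At the 16 kHz linear16 encoding this provider requests, the recommended 20-250 ms window works out to 320-4000 samples (640-8000 bytes) per binary frame. The sketch below shows that sizing and a way to chunk converted PCM before sending; the helper names and the 100 ms default are assumptions, not plugin code.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// For 16 kHz mono linear16 (the encoding this provider requests):
//   20 ms  -> 16000 * 0.020 = 320 samples  = 640 bytes
//   250 ms -> 16000 * 0.250 = 4000 samples = 8000 bytes
constexpr std::size_t kSampleRate = 16000;

constexpr std::size_t samples_for_ms(std::size_t ms)
{
	return kSampleRate * ms / 1000;
}

// Hypothetical helper: split converted PCM into ~100 ms chunks so each binary
// WebSocket frame stays inside the recommended 20-250 ms window.
std::vector<std::vector<int16_t>> chunk_pcm(const std::vector<int16_t> &pcm,
					    std::size_t chunk_ms = 100)
{
	const std::size_t chunk_samples = samples_for_ms(chunk_ms);
	std::vector<std::vector<int16_t>> chunks;
	for (std::size_t i = 0; i < pcm.size(); i += chunk_samples) {
		const std::size_t end = std::min(pcm.size(), i + chunk_samples);
		chunks.emplace_back(pcm.begin() + i, pcm.begin() + end);
	}
	return chunks;
}
```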

Messages
Keep Alive
Optional

Periodically sending KeepAlive messages while streaming ensures uninterrupted communication and minimizes costs. Learn More.

JSON

{
"type": "KeepAlive"
}
Finalize
Optional

The Finalize message can be used to handle specific scenarios where you need to force the server to process all unprocessed audio data and immediately return the final results. Learn More.

JSON

{
"type": "Finalize"
}
Close Stream
Optional

The CloseStream message can be sent to the Deepgram server, instructing it to close the connection. Learn More.

JSON

{
"type": "CloseStream"
}
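
shutdown() in deepgram-provider.cpp above already sends CloseStream; the provider does not currently send KeepAlive or Finalize. If audio capture can pause for long stretches, a keep-alive sender along the following lines could be added. This is a sketch under assumptions: it reuses the websocket::stream type declared in deepgram-provider.h, sendControlMessage/keepAliveLoop and the 5-second interval are illustrative, and writes would still need to be serialized with the audio-sending thread.

```cpp
#include <atomic>
#include <chrono>
#include <string>
#include <thread>

#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/websocket.hpp>

namespace beast = boost::beast;
namespace websocket = beast::websocket;
namespace net = boost::asio;
using tcp = net::ip::tcp;
using deepgram_ws = websocket::stream<beast::ssl_stream<tcp::socket>>;

// Control messages (KeepAlive, Finalize, CloseStream) are JSON and go out as
// text frames, unlike the binary audio payloads.
inline void sendControlMessage(deepgram_ws &ws, const std::string &json_body)
{
	ws.text(true);
	ws.write(net::buffer(json_body));
}

// Illustrative keep-alive loop: send {"type":"KeepAlive"} every 5 seconds while
// `running` stays true so Deepgram does not drop an idle connection. Calls into
// `ws` must not interleave with the audio-sending thread; guard with a mutex.
inline void keepAliveLoop(deepgram_ws &ws, const std::atomic<bool> &running)
{
	while (running) {
		std::this_thread::sleep_for(std::chrono::seconds(5));
		if (!running)
			break;
		sendControlMessage(ws, R"({"type":"KeepAlive"})");
	}
}
```
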
Responses
Refer to API Errors for more information.

| Status | Description |
| --- | --- |
| 200 | Audio submitted for transcription. |
| 400 | Bad Request. |
| 401 | Invalid Authorization. |
| 402 | Payment Required, insufficient credits. |
| 403 | Insufficient permissions. |
| 503 | Internal server error if the server is temporarily unable to serve requests. |
Response Schema
JSON

{
"metadata": {
"transaction_key": "string",
"request_id": "uuid",
"sha256": "string",
"created": "string",
"duration": 0,
"channels": 0,
"models": [
"string"
]
},
"type": "Results",
"channel_index": [
0,
0
],
"duration": 0.0,
"start": 0.0,
"is_final": boolean,
"speech_final": boolean,
"channel": {
"alternatives": [
{
"transcript": "string",
"confidence": 0,
"words": [
{
"word": "string",
"start": 0,
"end": 0,
"confidence": 0
}
]
}
],
"search": [
{
"query": "string",
"hits": [
{
"confidence": 0,
"start": 0,
"end": 0,
"snippet": "string"
}
]
}
]
}
}
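
readResultsFromTranscription() above indexes straight into this structure with operator[]. For reference, here is a more defensive parse of the same fields using the nlohmann::json library the provider already depends on; ParsedResult and parseResults are illustrative names, not plugin types.

```cpp
#include <nlohmann/json.hpp>
#include <optional>
#include <string>

// Illustrative result holder (not the plugin's DetectionResultWithText).
struct ParsedResult {
	std::string transcript;
	bool is_final = false;
	double start_sec = 0.0; // Deepgram reports word timings in seconds
	double end_sec = 0.0;
};

// Pull the fields this plugin uses out of a "Results" message; missing keys
// yield std::nullopt instead of throwing, unlike direct operator[] access.
inline std::optional<ParsedResult> parseResults(const std::string &msg)
{
	const auto j = nlohmann::json::parse(msg, nullptr, /*allow_exceptions=*/false);
	if (j.is_discarded() || !j.is_object() || j.value("type", "") != "Results")
		return std::nullopt;
	if (!j.contains("channel") || !j["channel"].contains("alternatives"))
		return std::nullopt;

	const auto &alts = j["channel"]["alternatives"];
	if (!alts.is_array() || alts.empty() || !alts[0].is_object())
		return std::nullopt;

	ParsedResult out;
	out.transcript = alts[0].value("transcript", "");
	out.is_final = j.value("is_final", false);

	if (alts[0].contains("words") && alts[0]["words"].is_array() &&
	    !alts[0]["words"].empty()) {
		const auto &words = alts[0]["words"];
		out.start_sec = words.front().value("start", 0.0);
		out.end_sec = words.back().value("end", 0.0);
	}
	return out;
}
```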
Errors & Warnings
If Deepgram encounters an error during real-time streaming, we will return a WebSocket Close frame. The body of the Close frame will indicate the reason for closing using one of the specification’s pre-defined status codes followed by a UTF-8-encoded payload that represents the reason for the error.

Current codes and payloads in use include:

| Code | Payload | Description |
| --- | --- | --- |
| 1000 | N/A | Normal Closure |
| 1008 | DATA-0000 | The payload cannot be decoded as audio. Either the encoding is incorrectly specified, the payload is not audio data, or the audio is in a format unsupported by Deepgram. |
| 1011 | NET-0000 | The service has not transmitted a Text frame to the client within the timeout window. This may indicate an issue internally in Deepgram's systems or could be due to Deepgram not receiving enough audio data to transcribe a frame. |
| 1011 | NET-0001 | The service has not received a Binary or Text frame from the client within the timeout window. This may indicate an internal issue in Deepgram's systems, the client's systems, or the network connecting them. |
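
With Boost.Beast, a read on a connection the server has closed fails with websocket::error::closed, and the close code and payload from this table are then available via ws.reason(). Below is a sketch of surfacing them; readOrReportClose is a hypothetical helper (the provider currently just logs e.what()), and it writes to stderr rather than obs_log to stay self-contained.

```cpp
#include <cstdio>
#include <string>

#include <boost/asio/ip/tcp.hpp>
#include <boost/beast/core.hpp>
#include <boost/beast/ssl.hpp>
#include <boost/beast/websocket.hpp>

namespace beast = boost::beast;
namespace websocket = beast::websocket;
using tcp = boost::asio::ip::tcp;
using deepgram_ws = websocket::stream<beast::ssl_stream<tcp::socket>>;

// Hypothetical helper: read one message without throwing; if the server sent a
// Close frame, report its status code and payload (e.g. 1008 / DATA-0000).
// Returns true only if a message was read into `buffer`.
inline bool readOrReportClose(deepgram_ws &ws, beast::flat_buffer &buffer)
{
	beast::error_code ec;
	ws.read(buffer, ec);
	if (!ec)
		return true;

	if (ec == websocket::error::closed) {
		const websocket::close_reason &cr = ws.reason();
		std::fprintf(stderr, "Deepgram closed the connection: code=%u payload=%s\n",
			     static_cast<unsigned>(cr.code),
			     std::string(cr.reason.data(), cr.reason.size()).c_str());
	} else {
		std::fprintf(stderr, "Deepgram read failed: %s\n", ec.message().c_str());
	}
	return false;
}
```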