diff --git a/app/jobs/regular/chatbot_topic_title_embedding.rb b/app/jobs/regular/chatbot_topic_title_embedding.rb
new file mode 100644
index 00000000..718f5a43
--- /dev/null
+++ b/app/jobs/regular/chatbot_topic_title_embedding.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+# Creates or refreshes the title embedding for one Topic.
+# The Topic id is read from opts[:id]; the work itself is delegated to
+# ::DiscourseChatbot::TopicTitleEmbeddingProcess#upsert.
+class ::Jobs::ChatbotTopicTitleEmbedding < Jobs::Base
+  sidekiq_options retry: 5, dead: false, queue: 'low'
+
+  def execute(opts)
+    begin
+      topic_id = opts[:id]
+
+      ::DiscourseChatbot.progress_debug_message("100. Creating/updating a Topic Title Embedding for Topic id: #{topic_id}")
+
+      process_topic_title_embedding = ::DiscourseChatbot::TopicTitleEmbeddingProcess.new
+
+      process_topic_title_embedding.upsert(topic_id)
+    rescue => e
+      Rails.logger.error("Chatbot: Topic Title Embedding: There was a problem, but will retry til limit: #{e}")
+      # Re-raise after logging: a swallowed exception looks like success to the
+      # job runner, so the `retry: 5` configured above would never be used.
+      raise
+    end
+  end
+end
diff --git a/app/jobs/regular/chatbot_topic_title_embedding_delete.rb b/app/jobs/regular/chatbot_topic_title_embedding_delete.rb
new file mode 100644
index 00000000..c8debb16
--- /dev/null
+++ b/app/jobs/regular/chatbot_topic_title_embedding_delete.rb
@@ -0,0 +1,21 @@
+# frozen_string_literal: true
+
+# Removes the stored title embedding for the Topic whose id is in opts[:id].
+# Runs at most once (retry: false); failures are logged, not raised.
+class ::Jobs::ChatbotTopicTitleEmbeddingDelete < Jobs::Base
+  sidekiq_options retry: false
+
+  def execute(opts)
+    begin
+      topic_id = opts[:id]
+
+      ::DiscourseChatbot.progress_debug_message("101. Deleting a Topic Title Embedding for Topic id: #{topic_id}")
+
+      # find_by returns nil when the Topic has no embedding row; treat that as
+      # nothing to delete instead of calling destroy! on nil.
+      ::DiscourseChatbot::TopicTitleEmbedding.find_by(topic_id: topic_id)&.destroy!
+    rescue => e
+      Rails.logger.error("Chatbot: Topic Title Embedding: There was a problem deleting the embedding, it will not be retried: #{e}")
+    end
+  end
+end
diff --git a/app/models/discourse_chatbot/topic_embeddings_bookmark.rb b/app/models/discourse_chatbot/topic_embeddings_bookmark.rb
new file mode 100644
index 00000000..e18d8fce
--- /dev/null
+++ b/app/models/discourse_chatbot/topic_embeddings_bookmark.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+module ::DiscourseChatbot
+  class TopicEmbeddingsBookmark < ActiveRecord::Base
+    self.table_name = 'chatbot_topic_embeddings_bookmark'
+
+    validates :topic_id, presence: true
+  end
+end
diff --git a/app/models/discourse_chatbot/topic_title_embedding.rb b/app/models/discourse_chatbot/topic_title_embedding.rb
new file mode 100644
index 00000000..fcdb0d4c
--- /dev/null
+++ b/app/models/discourse_chatbot/topic_title_embedding.rb
@@ -0,0 +1,9 @@
+# frozen_string_literal: true
+
+module ::DiscourseChatbot
+  class TopicTitleEmbedding < ActiveRecord::Base
+    self.table_name = 'chatbot_topic_title_embeddings'
+
+    validates :topic_id, presence: true, uniqueness: true
+  end
+end
diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml
index 4b804ff5..0a98cbb2 100644
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@@ -60,6 +60,7 @@ en:
     chatbot_forum_search_function_max_results: "Maximum number of Posts shown to bot from a forum search"
     chatbot_forum_search_function_similarity_threshold: "Fraction of 1. 
The point at which a Post is included in the results is defined by how similar it is to the query with 1 being identical and zero being nothing like the query"
     chatbot_forum_search_function_reranking_group_promotion: "Posts authored by these Groups are promoted in results set so their Posts are more likely to be seen and taken into account by the bot"
+    chatbot_forum_search_function_include_topic_titles: "Include Topic titles in the search results"
     chatbot_forum_search_function_results_content_type: "The scope of content to be returned in the search results. Choose 'posts' for just ranking Posts, 'topics' for the entire Topics that contain those ranked Posts"
     chatbot_forum_search_function_results_topic_max_posts_count_strategy: "The strategy used to determine the maximum number of Posts to be returned in the search results if content_type is 'topics'. Choose 'all' for all Posts, 'just_enough' to limit the Posts to only those up to including the ranked Post, 'stretch_if_required' to include all Posts up to the ranked Post regardless of the max setting, 'exact' for exactly the number of Posts specified in the max setting"
     chatbot_forum_search_function_results_topic_max_posts_count: "The maximum number of Posts to be returned in the search results if content_type is 'topics'"
diff --git a/config/settings.yml b/config/settings.yml
index a7424a97..0103d7f3 100644
--- a/config/settings.yml
+++ b/config/settings.yml
@@ -265,6 +265,9 @@ plugins:
     default: ""
     allow_any: false
     refresh: true
+  chatbot_forum_search_function_include_topic_titles: # checked by the forum search function before it searches topic title embeddings
+    client: false
+    default: false
   chatbot_forum_search_function_results_content_type:
     client: false
     default: "post"
diff --git a/db/migrate/20230820010103_create_chatbot_embeddings_table.rb b/db/migrate/20230820010103_create_chatbot_embeddings_table.rb
index 37bc53e6..d59ecfed 100644
--- a/db/migrate/20230820010103_create_chatbot_embeddings_table.rb
+++ b/db/migrate/20230820010103_create_chatbot_embeddings_table.rb
@@ -4,8 +4,8 @@ class CreateChatbotEmbeddingsTable < ActiveRecord::Migration[7.0]
   def change
     create_table :chatbot_embeddings do |t|
       t.integer :post_id, null: false, index: { unique: true }, foreign_key: true
-        t.column :embedding, "real[]", null: false
-        t.timestamps
+      t.column :embedding, "real[]", null: false
+      t.timestamps
     end
   end
 end
diff --git a/db/migrate/20231026010109_create_new_chatbot_post_embeddings_table.rb b/db/migrate/20231026010109_create_new_chatbot_post_embeddings_table.rb
index 4cdc8c66..96e2c2a5 100644
--- a/db/migrate/20231026010109_create_new_chatbot_post_embeddings_table.rb
+++ b/db/migrate/20231026010109_create_new_chatbot_post_embeddings_table.rb
@@ -4,8 +4,8 @@ class CreateNewChatbotPostEmbeddingsTable < ActiveRecord::Migration[7.0]
   def change
     create_table :chatbot_post_embeddings do |t|
       t.integer :post_id, null: false, index: { unique: true }, foreign_key: true
-        t.column :embedding, "vector(1536)", null: false
-        t.timestamps
+      t.column :embedding, "vector(1536)", null: false
+      t.timestamps
     end
   end
 end
diff --git a/db/migrate/20240412010101_create_chatbot_topic_title_embeddings_table.rb b/db/migrate/20240412010101_create_chatbot_topic_title_embeddings_table.rb
new file mode 100644
index 00000000..440572bc
--- /dev/null
+++ b/db/migrate/20240412010101_create_chatbot_topic_title_embeddings_table.rb
@@ -0,0 +1,12 @@
+# frozen_string_literal: true
+
+class CreateChatbotTopicTitleEmbeddingsTable < ActiveRecord::Migration[7.0] # table behind ::DiscourseChatbot::TopicTitleEmbedding
+  def change
+    create_table :chatbot_topic_title_embeddings do |t|
+      t.integer :topic_id, null: false, index: { unique: true }, foreign_key: true # unique index: at most one embedding row per Topic
+      t.column :embedding, "vector(1536)", null: false # fixed-size vector(1536) column; the HNSW cosine index migration is built on it
+      t.column :model, :string, default: nil # embeddings model name, written by TopicTitleEmbeddingProcess#upsert and compared in #is_valid
+      t.timestamps
+    end
+  end
+end
diff --git a/db/migrate/20240412010103_create_chatbot_topic_embeddings_bookmark_table.rb b/db/migrate/20240412010103_create_chatbot_topic_embeddings_bookmark_table.rb
new file mode 100644
index 00000000..ee4b4831
--- /dev/null
+++ 
b/db/migrate/20240412010103_create_chatbot_topic_embeddings_bookmark_table.rb
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+class CreateChatbotTopicEmbeddingsBookmarkTable < ActiveRecord::Migration[7.0]
+  def change
+    create_table :chatbot_topic_embeddings_bookmark do |t|
+      t.integer :topic_id
+      t.timestamps
+    end
+  end
+end
diff --git a/db/migrate/20240412010105_create_cosine_pg_vector_chatbot_topic_title_embeddings_index.rb b/db/migrate/20240412010105_create_cosine_pg_vector_chatbot_topic_title_embeddings_index.rb
new file mode 100644
index 00000000..c78623eb
--- /dev/null
+++ b/db/migrate/20240412010105_create_cosine_pg_vector_chatbot_topic_title_embeddings_index.rb
@@ -0,0 +1,16 @@
+# frozen_string_literal: true
+
+class CreateCosinePgVectorChatbotTopicTitleEmbeddingsIndex < ActiveRecord::Migration[7.0]
+  def up
+    execute <<-SQL
+      CREATE INDEX pgv_hnsw_index_on_chatbot_topic_title_embeddings ON chatbot_topic_title_embeddings USING hnsw (embedding vector_cosine_ops)
+      WITH (m = 32, ef_construction = 64);
+    SQL
+  end
+
+  def down
+    execute <<-SQL
+      DROP INDEX IF EXISTS pgv_hnsw_index_on_chatbot_topic_title_embeddings;
+    SQL
+  end
+end
diff --git a/lib/discourse_chatbot/embedding_completionist_process.rb b/lib/discourse_chatbot/embedding_completionist_process.rb
index a75e0507..3df50778 100644
--- a/lib/discourse_chatbot/embedding_completionist_process.rb
+++ b/lib/discourse_chatbot/embedding_completionist_process.rb
@@ -6,9 +6,62 @@ module ::DiscourseChatbot
   class EmbeddingCompletionist
 
     def self.process
+      process_posts
+      process_topics
+    end
+
+    # Enqueues a title-embedding job for the next chunk of Topics, starting at
+    # the bookmarked Topic id, then moves the bookmark to the Topic after the
+    # chunk (wrapping round to the first Topic once the end is reached).
+    # Returns the new bookmark Topic id, or nil when there is nothing to process.
+    def self.process_topics
+      first_topic_id = ::Topic.first&.id
+      post_count = ::Post.count
+
+      # With no Topics there is nothing to bookmark, and with no Posts the
+      # ratio below would divide by zero and `ceil` would raise.
+      return if first_topic_id.nil? || post_count.zero?
+
+      bookmarked_topic_id = ::DiscourseChatbot::TopicEmbeddingsBookmark.first&.topic_id || first_topic_id
+
+      # Scale the Posts chunk size by the Topics-to-Posts ratio.
+      limit = (EMBEDDING_PROCESS_POSTS_CHUNK * (::Topic.count.fdiv(post_count))).ceil
+
+      topic_range = ::Topic.where("id >= ?", bookmarked_topic_id).order(:id).limit(limit).pluck(:id)
+
+      topic_range.each do |topic_id|
+        Jobs.enqueue(:chatbot_topic_title_embedding, id: topic_id)
+      end
+
+      # Only the Topic after the last one enqueued matters, so look it up once
+      # instead of once per loop iteration.
+      unless topic_range.empty?
+        bookmarked_topic_id = ::Topic.where("id > ?", topic_range.last).order(:id).limit(1).pluck(:id).first
+      end
+
+      bookmarked_topic_id = first_topic_id if bookmarked_topic_id.nil?
+
+      bookmark = ::DiscourseChatbot::TopicEmbeddingsBookmark.first
+
+      if bookmark
+        bookmark.topic_id = bookmarked_topic_id
+      else
+        bookmark = ::DiscourseChatbot::TopicEmbeddingsBookmark.new(topic_id: bookmarked_topic_id)
+      end
+
+      bookmark.save!
+      ::DiscourseChatbot.progress_debug_message <<~EOS
+      ---------------------------------------------------------------------------------------------------------------
+      Topic Embeddings Completion Bookmark is now at Topic: #{bookmark.topic_id}
+      ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+      EOS
+      bookmark.topic_id
+    end
+
+    def self.process_posts
       bookmarked_post_id = ::DiscourseChatbot::PostEmbeddingsBookmark.first&.post_id || ::Post.first.id
 
-      post_range = ::Post.where("id >= ?", bookmarked_post_id).order(:id).limit(EMBEDDING_PROCESS_CHUNK).pluck(:id)
+      post_range = ::Post.where("id >= ?", bookmarked_post_id).order(:id).limit(EMBEDDING_PROCESS_POSTS_CHUNK).pluck(:id)
 
       post_range.each do |post_id|
         Jobs.enqueue(:chatbot_post_embedding, id: post_id)
diff --git a/lib/discourse_chatbot/embedding_process.rb b/lib/discourse_chatbot/embedding_process.rb
new file mode 100644
index 00000000..510db206
--- /dev/null
+++ b/lib/discourse_chatbot/embedding_process.rb
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+require "openai"
+
+module ::DiscourseChatbot
+
+  class EmbeddingProcess
+
+    def setup_api
+      ::OpenAI.configure do |config|
+        config.access_token = SiteSetting.chatbot_open_ai_token
+      end
+      if !SiteSetting.chatbot_open_ai_embeddings_model_custom_url.blank?
+        ::OpenAI.configure do |config|
+          config.uri_base = SiteSetting.chatbot_open_ai_embeddings_model_custom_url
+        end
+      end
+      if SiteSetting.chatbot_open_ai_model_custom_api_type == "azure"
+        ::OpenAI.configure do |config|
+          config.api_type = :azure
+          config.api_version = SiteSetting.chatbot_open_ai_model_custom_api_version
+        end
+      end
+      @model_name = SiteSetting.chatbot_open_ai_embeddings_model # read by subclasses when they request embeddings
+      @client = ::OpenAI::Client.new
+    end
+
+    def upsert(id) # hook: the base class only raises; PostEmbeddingProcess and TopicTitleEmbeddingProcess define their own
+      raise "Overwrite me!"
+    end
+
+    def get_embedding_from_api(id) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+
+    def semantic_search(query) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+    def in_scope(id) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+    def is_valid(id) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+    def in_categories_scope(id) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+    def in_benchmark_user_scope(id) # hook, see upsert
+      raise "Overwrite me!"
+    end
+
+    def benchmark_user # returns the user whose visibility decides what may be embedded; result is cached for one hour
+      cache_key = "chatbot_benchmark_user"
+      benchmark_user = Discourse.cache.fetch(cache_key, expires_in: 1.hour) do
+        allowed_group_ids = [0, 10, 11, 12, 13, 14] # automated groups only
+        barred_group_ids = ::Group.where.not(id: allowed_group_ids).pluck(:id) # no custom groups
+        unsuitable_users = ::GroupUser.where(group_id: barred_group_ids).pluck(:user_id).uniq # don't choose someone within a custom group
+        safe_users = ::User.where.not(id: unsuitable_users).distinct.pluck(:id) # exclude them and find a suitable vanilla, junior user
+        user = ::User.where(id: safe_users).where(trust_level: SiteSetting.chatbot_embeddings_benchmark_user_trust_level, active: true, admin: false, suspended_at: nil)&.last
+        if user.nil?
+          raise StandardError, "Chatbot: No benchmark user exists for Post embedding suitability check, please add a basic user"
+        end
+        user
+      end
+
+      benchmark_user
+    end
+  end
+end
diff --git a/lib/discourse_chatbot/functions/forum_search_function.rb b/lib/discourse_chatbot/functions/forum_search_function.rb
index c08e8843..07c1b6f8 100644
--- a/lib/discourse_chatbot/functions/forum_search_function.rb
+++ b/lib/discourse_chatbot/functions/forum_search_function.rb
@@ -27,32 +27,61 @@ def required
     def process(args)
       begin
         super(args)
+        top_topics = []
+        top_topics_from_post_results = []
+        top_topics_from_topic_title_results = []
+        top_topic_title_results = [] # stays empty unless topic-title search is enabled below
         query = args[parameters[0][:name]]
 
         number_of_posts = args[parameters[1][:name]].blank? ? 3 : args[parameters[1][:name]]
         number_of_posts = number_of_posts > SiteSetting.chatbot_forum_search_function_max_results ? SiteSetting.chatbot_forum_search_function_max_results : number_of_posts
 
         process_post_embedding = ::DiscourseChatbot::PostEmbeddingProcess.new
         results = process_post_embedding.semantic_search(query)
-
         top_results = results[0..(number_of_posts - 1)]
 
+        if SiteSetting.chatbot_forum_search_function_include_topic_titles
+          process_topic_title_embedding = ::DiscourseChatbot::TopicTitleEmbeddingProcess.new
+          topic_title_results = process_topic_title_embedding.semantic_search(query)
+          top_topic_title_results = topic_title_results[0..(number_of_posts - 1)] # keep at most number_of_posts title matches
+
+          # exclude if not in scope for embeddings (job hasn't caught up yet)
+          top_topic_title_results = top_topic_title_results.filter { |result| ::DiscourseChatbot::TopicTitleEmbeddingProcess.new.in_scope(result[:topic_id]) && ::DiscourseChatbot::TopicTitleEmbeddingProcess.new.is_valid(result[:topic_id])}
+        end
+
         # exclude if not in scope for embeddings (job hasn't caught up yet)
-        top_results.select { |result| !::DiscourseChatbot::PostEmbeddingProcess.new.in_scope(result[:post_id]) || !::DiscourseChatbot::PostEmbeddingProcess.new.is_valid( result[:post_id])}
+
top_results = top_results.filter { |result| ::DiscourseChatbot::PostEmbeddingProcess.new.in_scope(result[:post_id]) && ::DiscourseChatbot::PostEmbeddingProcess.new.is_valid( result[:post_id])} # result is assigned back, so the filter takes effect
+
+        if SiteSetting.chatbot_forum_search_function_results_content_type == "topic" || top_topic_title_results.length > 0 # topic-style answer is also used whenever a title match survived filtering
+          top_topics_from_post_results = top_results.map { |result| ::Post.find(result[:post_id].to_i).topic_id }.uniq
+          top_topics_from_topic_title_results = top_topic_title_results.map { |result| result[:topic_id].to_i }.uniq
+          top_topics = (top_topics_from_post_results + top_topics_from_topic_title_results).uniq # post-derived Topics first, then title-derived ones
 
-        if SiteSetting.chatbot_forum_search_function_results_content_type == "topic"
-          top_topics = top_results.map { |result| ::Post.find(result[:post_id].to_i).topic_id }.uniq
           response = I18n.t("chatbot.prompt.function.forum_search.answer.topic.summary", number_of_topics: top_topics.length)
 
           accepted_post_types = SiteSetting.chatbot_include_whispers_in_post_history ? ::DiscourseChatbot::POST_TYPES_INC_WHISPERS : ::DiscourseChatbot::POST_TYPES_REGULAR_ONLY
 
           top_topics.each_with_index do |topic_id, index|
-            top_result = top_results.find do |result|
+            top_post_result = {}
+            top_post_result = top_results.find do |result| # nil when this Topic came only from a title match
               post_topic_id = ::Post.find(result[:post_id].to_i).topic_id
               post_topic_id == topic_id
             end
-            score = top_result[:score]
-            original_post_number = ::Post.find(top_result[:post_id]).post_number
+
+            top_topic_title_result = {}
+            top_topic_title_result = top_topic_title_results.find do |result| # nil when this Topic came only from a post match
+              topic_id == result[:topic_id]
+            end
+
+            original_post_number = nil # stays nil for title-only matches; the strategy branches below fall back on it
+
+            if !top_post_result.blank?
+              score = top_post_result[:score] # a post match takes precedence over a title match for the score
+              original_post_number = ::Post.find(top_post_result[:post_id]).post_number
+            else
+              score = top_topic_title_result[:score]
+            end
+
             current_topic = ::Topic.find(topic_id)
             url = "https://#{Discourse.current_hostname}/t/slug/#{current_topic.id}"
             title = current_topic.title
@@ -63,12 +92,13 @@ def process(args)
               when "all"
                 Topic.find(topic_id).highest_post_number
               when "just_enough"
-                original_post_number
+                original_post_number || SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
               when "stretch_if_required"
-                original_post_number > SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count ? original_post_number : SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
+                (original_post_number || 0) > SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count ? original_post_number : SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
               else
                 SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
               end
+
             while post_number <= max_post_number do
               post = ::Post.find_by(topic_id: topic_id, post_number: post_number )
               break if post.nil?
@@ -90,7 +120,8 @@ def process(args)
           end
         end
         response
-      rescue
+      rescue StandardError => e
+        Rails.logger.error("Chatbot: Error occurred while attempting to retrieve Forum Search results for query '#{query}': #{e.message}")
         I18n.t("chatbot.prompt.function.forum_search.error", query: args[parameters[0][:name]])
       end
     end
diff --git a/lib/discourse_chatbot/post_embedding_process.rb b/lib/discourse_chatbot/post/post_embedding_process.rb
similarity index 75%
rename from lib/discourse_chatbot/post_embedding_process.rb
rename to lib/discourse_chatbot/post/post_embedding_process.rb
index 9e0a8a90..6632ae0e 100644
--- a/lib/discourse_chatbot/post_embedding_process.rb
+++ b/lib/discourse_chatbot/post/post_embedding_process.rb
@@ -3,26 +3,7 @@
 
 module ::DiscourseChatbot
 
-  class PostEmbeddingProcess
-
-    def setup_api
-      ::OpenAI.configure do |config|
-        config.access_token = SiteSetting.chatbot_open_ai_token
-      end
-      if !SiteSetting.chatbot_open_ai_embeddings_model_custom_url.blank?
-        ::OpenAI.configure do |config|
-          config.uri_base = SiteSetting.chatbot_open_ai_embeddings_model_custom_url
-        end
-      end
-      if SiteSetting.chatbot_open_ai_model_custom_api_type == "azure"
-        ::OpenAI.configure do |config|
-          config.api_type = :azure
-          config.api_version = SiteSetting.chatbot_open_ai_model_custom_api_version
-        end
-      end
-      @model_name = SiteSetting.chatbot_open_ai_embeddings_model
-      @client = ::OpenAI::Client.new
-    end
+  class PostEmbeddingProcess < EmbeddingProcess # setup_api and benchmark_user now come from the shared base class
 
     def upsert(post_id)
       if in_scope(post_id)
@@ -162,22 +143,5 @@ def in_benchmark_user_scope(post_id)
       return false if topic.archetype == ::Archetype.private_message
       Guardian.new(benchmark_user).can_see?(post)
     end
-
-    def benchmark_user
-      cache_key = "chatbot_benchmark_user"
-      benchmark_user = Discourse.cache.fetch(cache_key, expires_in: 1.hour) do
-        allowed_group_ids = [0, 10, 11, 12, 13, 14] # automated groups only
-        barred_group_ids = ::Group.where.not(id: allowed_group_ids).pluck(:id) # no custom groups
-        unsuitable_users = 
::GroupUser.where(group_id: barred_group_ids).pluck(:user_id).uniq # don't choose someone with in a custom group
-        safe_users = ::User.where.not(id: unsuitable_users).distinct.pluck(:id) # exclude them and find a suitable vanilla, junior user
-        user = ::User.where(id: safe_users).where(trust_level: SiteSetting.chatbot_embeddings_benchmark_user_trust_level, active: true, admin: false, suspended_at: nil)&.last
-        if user.nil?
-          raise StandardError, "Chatbot: No benchmark user exists for Post embedding suitability check, please add a basic user"
-        end
-        user
-      end
-
-      benchmark_user
-    end
   end
 end
diff --git a/lib/discourse_chatbot/topic/topic_title_embedding_process.rb b/lib/discourse_chatbot/topic/topic_title_embedding_process.rb
new file mode 100644
index 00000000..db458283
--- /dev/null
+++ b/lib/discourse_chatbot/topic/topic_title_embedding_process.rb
@@ -0,0 +1,141 @@
+# frozen_string_literal: true
+require "openai"
+
+module ::DiscourseChatbot
+
+  class TopicTitleEmbeddingProcess < EmbeddingProcess # creates, validates and searches embeddings of Topic titles
+
+    def upsert(topic_id) # stores an embedding for an in-scope Topic that lacks a valid one; deletes the row of an out-of-scope Topic
+      if in_scope(topic_id)
+        if !is_valid(topic_id) # NOTE(review): is_valid checks only row presence and model name, so a changed title is not re-embedded here - confirm intended
+
+          embedding_vector = get_embedding_from_api(topic_id)
+
+          ::DiscourseChatbot::TopicTitleEmbedding.upsert({ topic_id: topic_id, model: SiteSetting.chatbot_open_ai_embeddings_model, embedding: "#{embedding_vector}" }, on_duplicate: :update, unique_by: :topic_id)
+
+          ::DiscourseChatbot.progress_debug_message <<~EOS
+          ---------------------------------------------------------------------------------------------------------------
+          Topic Title Embeddings: I found an embedding that needed populating or updating, id: #{topic_id}
+          ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+          EOS
+        end
+      else
+        topic_title_embedding = ::DiscourseChatbot::TopicTitleEmbedding.find_by(topic_id: topic_id)
+        if topic_title_embedding
+          ::DiscourseChatbot.progress_debug_message <<~EOS
+          ---------------------------------------------------------------------------------------------------------------
+          Topic Title Embeddings: I found a Topic that was out of scope for embeddings, so deleted the embedding, id: #{topic_id}
+          ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+          EOS
+          topic_title_embedding.delete # the message above is emitted before the row is actually removed
+        end
+      end
+    end
+
+    def get_embedding_from_api(topic_id) # returns the embedding for the Topic's title; logs and re-raises any StandardError
+      begin
+        self.setup_api
+
+        topic = ::Topic.find_by(id: topic_id) # nil for an unknown id; upsert only calls this after in_scope has found the Topic
+        response = @client.embeddings(
+          parameters: {
+            model: @model_name,
+            input: topic.title
+          }
+        )
+
+        if response.dig("error")
+          error_text = response.dig("error", "message")
+          raise StandardError, error_text
+        end
+      rescue StandardError => e
+        Rails.logger.error("Chatbot: Error occurred while attempting to retrieve Embedding for topic id '#{topic_id}': #{e.message}")
+        raise e
+      end
+
+      embedding_vector = response.dig("data", 0, "embedding") # value of this assignment is the method's return value
+    end
+
+
+    def semantic_search(query) # returns an array of { topic_id:, user_id:, score: } hashes, score being 1 - cosine distance
+      self.setup_api
+
+      response = @client.embeddings(
+        parameters: {
+          model: @model_name,
+          input: query[0..SiteSetting.chatbot_open_ai_embeddings_char_limit]
+        }
+      )
+
+      query_vector = response.dig("data", 0, "embedding") # NOTE(review): unlike get_embedding_from_api, an "error" response is not checked here, so this may be nil
+
+      begin
+        threshold = SiteSetting.chatbot_forum_search_function_similarity_threshold
+        results =
+          DB.query(<<~SQL, query_embedding: query_vector, threshold: threshold, limit: 100)
+            SELECT
+              topic_id,
+              t.user_id,
+              embedding <=> '[:query_embedding]' as cosine_distance
+            FROM
+              chatbot_topic_title_embeddings
+            INNER JOIN
+              topics t
+            ON
+              topic_id = t.id
+            WHERE
+              (1 - (embedding <=> '[:query_embedding]')) > :threshold
+            ORDER BY
+              embedding <=> '[:query_embedding]'
+            LIMIT :limit
+          SQL
+
+        high_ranked_users = []
+
+        SiteSetting.chatbot_forum_search_function_reranking_group_promotion_map.each do |g|
+          high_ranked_users = high_ranked_users | GroupUser.where(group_id: g).pluck(:user_id)
+        end
+
+        reranked_results = results.filter {|r| high_ranked_users.include?(r.user_id)} + results.filter {|r| !high_ranked_users.include?(r.user_id)}.first(20) # promoted authors' Topics first; only the remainder is capped at 20
+
+      rescue PG::Error => e
+        Rails.logger.error(
+          "Error #{e} querying embeddings for search #{query}",
+        )
+        raise MissingEmbeddingError # NOTE(review): this constant is not defined anywhere in view - confirm it resolves
+      end
+      reranked_results.map {|t| { topic_id: t.topic_id, user_id: t.user_id, score: (1 - t.cosine_distance) } }
+    end
+
+    def in_scope(topic_id) # true when the Topic exists and passes the scope check selected by chatbot_embeddings_strategy
+      return false if !::Topic.find_by(id: topic_id).present?
+      if SiteSetting.chatbot_embeddings_strategy == "categories"
+        return false if !in_categories_scope(topic_id)
+      else
+        return false if !in_benchmark_user_scope(topic_id)
+      end
+      true
+    end
+
+    def is_valid(topic_id) # true when an embedding row exists and was produced by the currently configured model
+      embedding_record = ::DiscourseChatbot::TopicTitleEmbedding.find_by(topic_id: topic_id)
+      return false if !embedding_record.present?
+      return false if embedding_record.model != SiteSetting.chatbot_open_ai_embeddings_model
+      true
+    end
+
+    def in_categories_scope(topic_id) # true for a non-private-message Topic whose category id is listed in chatbot_embeddings_categories
+      topic = ::Topic.find_by(id: topic_id)
+      return false if topic.nil?
+      return false if topic.archetype == ::Archetype.private_message
+      SiteSetting.chatbot_embeddings_categories.split("|").include?(topic.category_id.to_s)
+    end
+
+    def in_benchmark_user_scope(topic_id) # true for a non-private-message Topic that the benchmark user can see
+      topic = ::Topic.find_by(id: topic_id)
+      return false if topic.nil?
+      return false if topic.archetype == ::Archetype.private_message
+      Guardian.new(benchmark_user).can_see?(topic)
+    end
+  end
+end
diff --git a/lib/tasks/chatbot.rake b/lib/tasks/chatbot.rake
index 60f7352f..287e3af7 100644
--- a/lib/tasks/chatbot.rake
+++ b/lib/tasks/chatbot.rake
@@ -4,47 +4,13 @@ task "chatbot:refresh_embeddings", %i[missing_only delay] => :environment do |_,
   ENV["RAILS_DB"] ? 
refresh_embeddings(args) : refresh_embeddings_all_sites(args)
 end
 
-desc "Refresh embeddings for all posts matching string/regex and optionally delay the loop"
-task "chatbot:refresh_embeddings_match", %i[pattern type delay] => [:environment] do |_, args|
-  args.with_defaults(type: "string")
-  pattern = args[:pattern]
-  type = args[:type]&.downcase
-  delay = args[:delay]&.to_i
-
-  if !pattern
-    puts "ERROR: Expecting rake chatbot:refresh_embeddings_match[pattern,type,delay]"
-    exit 1
-  elsif delay && delay < 1
-    puts "ERROR: delay parameter should be an integer and greater than 0"
-    exit 1
-  elsif type != "string" && type != "regex"
-    puts "ERROR: Expecting rake chatbot:refresh_embeddings_match[pattern,type] where type is string or regex"
-    exit 1
-  end
-
-  search = Post.raw_match(pattern, type)
-
-  refreshed = 0
-  total = search.count
-
-  process_post_embedding = ::DiscourseChatbot::PostEmbeddingProcess.new
-
-  search.find_each do |post|
-    process_post_embedding.upsert(post.id)
-    print_status(refreshed += 1, total)
-    sleep(delay) if delay
-  end
-
-  puts "", "#{refreshed} posts done!", ""
-end
-
 def refresh_embeddings_all_sites(args)
   RailsMultisite::ConnectionManagement.each_connection { |db| refresh_embeddings(args) }
 end
 
 def refresh_embeddings(args)
   puts "-" * 50
-  puts "Refreshing embeddings for posts for '#{RailsMultisite::ConnectionManagement.current_db}'"
+  puts "Refreshing embeddings for posts and topic titles for '#{RailsMultisite::ConnectionManagement.current_db}'"
   puts "-" * 50
 
   missing_only = args[:missing_only]&.to_i
@@ -82,4 +48,28 @@ def refresh_embeddings(args)
   end
 
   puts "", "#{refreshed} posts done!", "-" * 50
+
+  begin # second pass: Topic titles, walked newest-first in batches
+    total = Topic.count
+    refreshed = 0
+    batch = 1000
+
+    process_topic_title_embedding = ::DiscourseChatbot::TopicTitleEmbeddingProcess.new
+
+    (0..(total - 1).abs).step(batch) do |i|
+      Topic
+        .order(id: :desc)
+        .offset(i)
+        .limit(batch)
+        .each do |topic|
+          if !missing_only.to_i.zero? && ::DiscourseChatbot::TopicTitleEmbedding.find_by(topic_id: topic.id).nil? || missing_only.to_i.zero? # with missing_only set, skip Topics that already have an embedding row
+            process_topic_title_embedding.upsert(topic.id) # Topic ids must go to the topic-title processor, not the Post processor
+            sleep(delay) if delay
+          end
+          print_status(refreshed += 1, total)
+        end
+    end
+  end
+
+  puts "", "#{refreshed} topic titles done!", "-" * 50
 end
diff --git a/plugin.rb b/plugin.rb
index 32871d75..223e906d 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -1,7 +1,7 @@
 # frozen_string_literal: true
 # name: discourse-chatbot
 # about: a plugin that allows you to have a conversation with a configurable chatbot in Discourse Chat, Topics and Private Messages
-# version: 0.9.12
+# version: 0.9.13
 # authors: merefield
 # url: https://github.com/merefield/discourse-chatbot
 
@@ -31,7 +31,7 @@ module ::DiscourseChatbot
   MEDIUM_TRUST_LEVEL = 2
   LOW_TRUST_LEVEL = 1
 
-  EMBEDDING_PROCESS_CHUNK = 300
+  EMBEDDING_PROCESS_POSTS_CHUNK = 300
 
   def progress_debug_message(message)
     puts "Chatbot: #{message}" if SiteSetting.chatbot_enable_verbose_console_logging
@@ -67,7 +67,11 @@ def progress_debug_message(message)
     ../lib/discourse_chatbot/event_evaluation.rb
     ../app/models/discourse_chatbot/post_embedding.rb
     ../app/models/discourse_chatbot/post_embeddings_bookmark.rb
-    ../lib/discourse_chatbot/post_embedding_process.rb
+    ../app/models/discourse_chatbot/topic_title_embedding.rb
+    ../app/models/discourse_chatbot/topic_embeddings_bookmark.rb
+    ../lib/discourse_chatbot/embedding_process.rb
+    ../lib/discourse_chatbot/post/post_embedding_process.rb
+    ../lib/discourse_chatbot/topic/topic_title_embedding_process.rb
     ../lib/discourse_chatbot/embedding_completionist_process.rb
     ../lib/discourse_chatbot/message/message_evaluation.rb
     ../lib/discourse_chatbot/post/post_evaluation.rb
@@ -104,6 +108,9 @@ def progress_debug_message(message)
     ../app/controllers/discourse_chatbot/chatbot_controller.rb
     ../app/jobs/regular/chatbot_reply.rb
     ../app/jobs/regular/chatbot_post_embedding.rb
+    ../app/jobs/regular/chatbot_post_embedding_delete.rb
+
../app/jobs/regular/chatbot_topic_title_embedding.rb
+    ../app/jobs/regular/chatbot_topic_title_embedding_delete.rb
     ../app/jobs/scheduled/chatbot_quota_reset.rb
     ../app/jobs/scheduled/chatbot_embeddings_set_completer.rb
   ).each do |path|
@@ -161,12 +168,44 @@ class ::Chat::UpdateUserLastRead
     end
   end
 
+  DiscourseEvent.on(:topic_destroyed) do |*params| # drop the title embedding when a Topic is destroyed
+    topic, opts, user = params # only topic is used below
+
+    if SiteSetting.chatbot_enabled
+      job_class = ::Jobs::ChatbotTopicTitleEmbeddingDelete
+      job_class.perform_async(topic.as_json) # the job reads the Topic id from opts[:id]
+    end
+  end
+
+  DiscourseEvent.on(:topic_recovered) do |*params| # rebuild the title embedding when a Topic is recovered
+    topic, opts = params
+
+    if SiteSetting.chatbot_enabled
+      job_class = ::Jobs::ChatbotTopicTitleEmbedding
+      job_class.perform_async(topic.as_json)
+    end
+  end
+
+  DiscourseEvent.on(:topic_created) do |*params| # embed the title of every newly created Topic
+    topic, opts = params
+
+    if SiteSetting.chatbot_enabled
+      job_class = ::Jobs::ChatbotTopicTitleEmbedding
+      job_class.perform_async(topic.as_json)
+    end
+  end
+
   DiscourseEvent.on(:post_edited) do |*params|
-    post, opts = params
+    post, topic_changed, opts = params
 
     if SiteSetting.chatbot_enabled && post.post_type == 1
       job_class = ::Jobs::ChatbotPostEmbedding
       job_class.perform_async(post.as_json)
+
+      if post.is_first_post? && topic_changed # a first-post edit that changed the Topic also queues a title embedding job
+        job_class = ::Jobs::ChatbotTopicTitleEmbedding
+        job_class.perform_async(post.topic.as_json)
+      end
     end
   end
 
diff --git a/spec/lib/embedding_completionist_process_spec.rb b/spec/lib/embedding_completionist_process_spec.rb
index b37de07b..39e00635 100644
--- a/spec/lib/embedding_completionist_process_spec.rb
+++ b/spec/lib/embedding_completionist_process_spec.rb
@@ -9,10 +9,10 @@
     let(:post_3) { Fabricate(:post) }
     let(:post_4) { Fabricate(:post) }
     let(:post_5) { Fabricate(:post) }
-    @original_constant = DiscourseChatbot::EMBEDDING_PROCESS_CHUNK
+    original_constant = DiscourseChatbot::EMBEDDING_PROCESS_POSTS_CHUNK # a local is captured by the hook below; an @ivar set at group level is nil inside hooks and examples
 
     after(:each) do
-      DiscourseChatbot.const_set(:EMBEDDING_PROCESS_CHUNK, @original_constant)
+      DiscourseChatbot.const_set(:EMBEDDING_PROCESS_POSTS_CHUNK, original_constant)
     end
 
     it 'should process a chunk each time its called and reset to start once it gets to end' do
@@ -23,14 +23,14 @@
       expect(post_4).to be_present
       expect(post_5).to be_present
 
-      DiscourseChatbot.const_set(:EMBEDDING_PROCESS_CHUNK, 3)
+      DiscourseChatbot.const_set(:EMBEDDING_PROCESS_POSTS_CHUNK, 3)
       DiscourseChatbot::PostEmbeddingsBookmark.new(post_id: post_1.id).save!
-      expect(described_class.process).to eq(post_4.id)
+      expect(described_class.process_posts).to eq(post_4.id)
       bookmark = DiscourseChatbot::PostEmbeddingsBookmark.first
       expect(bookmark).to be_present
       expect(bookmark.post_id).to eq(post_4.id)
-      expect(described_class.process).to eq(post_1.id)
-      expect(described_class.process).to eq(post_4.id)
+      expect(described_class.process_posts).to eq(post_1.id)
+      expect(described_class.process_posts).to eq(post_4.id)
     end
   end
 end