Skip to content

Commit

Permalink
Merge pull request #86 from merefield/topic_title_embedding
Browse files Browse the repository at this point in the history
FEATURE: topic title embeddings and semantic title search
  • Loading branch information
merefield authored Apr 13, 2024
2 parents 1321a28 + a67244f commit 8bbff74
Show file tree
Hide file tree
Showing 19 changed files with 470 additions and 97 deletions.
20 changes: 20 additions & 0 deletions app/jobs/regular/chatbot_topic_title_embedding.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# frozen_string_literal: true

# Job is triggered on an update to a Post.
#
# Creates or refreshes the stored title embedding for a single Topic by
# delegating to DiscourseChatbot::TopicTitleEmbeddingProcess#upsert.
#
# opts[:id] - id of the Topic whose title embedding should be upserted.
#
# Any failure is logged and then re-raised. Swallowing the exception would
# make the job look successful to Sidekiq, so the `retry: 5` policy declared
# below (and promised by the log message) would never be applied.
class ::Jobs::ChatbotTopicTitleEmbedding < Jobs::Base
  sidekiq_options retry: 5, dead: false, queue: 'low'

  def execute(opts)
    topic_id = opts[:id]

    ::DiscourseChatbot.progress_debug_message("100. Creating/updating a Topic Title Embedding for Topic id: #{topic_id}")

    ::DiscourseChatbot::TopicTitleEmbeddingProcess.new.upsert(topic_id)
  rescue => e
    Rails.logger.error("Chatbot: Topic Title Embedding: There was a problem, but will retry til limit: #{e}")
    # Re-raise so the job is actually retried up to the configured limit.
    raise
  end
end
18 changes: 18 additions & 0 deletions app/jobs/regular/chatbot_topic_title_embedding_delete.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# frozen_string_literal: true

# Job is triggered on a Topic destruction.
#
# Removes the stored title embedding row for a single Topic, if one exists.
#
# opts[:id] - id of the Topic whose title embedding should be deleted.
#
# The job is declared with `retry: false`, so it is best-effort: failures are
# logged and not re-raised.
class ::Jobs::ChatbotTopicTitleEmbeddingDelete < Jobs::Base
  sidekiq_options retry: false

  def execute(opts)
    topic_id = opts[:id]

    ::DiscourseChatbot.progress_debug_message("101. Deleting a Topic Title Embedding for Topic id: #{topic_id}")

    # find_by returns nil when the Topic never had an embedding; that is not
    # an error, so only destroy when a row was found.
    ::DiscourseChatbot::TopicTitleEmbedding.find_by(topic_id: topic_id)&.destroy!
  rescue => e
    # No retry is configured for this job, so the message must not promise one.
    Rails.logger.error("Chatbot: Topic Title Embedding: There was a problem deleting the embedding for Topic id #{topic_id}: #{e}")
  end
end
9 changes: 9 additions & 0 deletions app/models/discourse_chatbot/topic_embeddings_bookmark.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# frozen_string_literal: true

module ::DiscourseChatbot
  # ActiveRecord model mapped to the chatbot_topic_embeddings_bookmark table.
  # A record is only valid when it carries a topic_id.
  class TopicEmbeddingsBookmark < ActiveRecord::Base
    self.table_name = "chatbot_topic_embeddings_bookmark"

    validates_presence_of :topic_id
  end
end
9 changes: 9 additions & 0 deletions app/models/discourse_chatbot/topic_title_embedding.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# frozen_string_literal: true

module ::DiscourseChatbot
  # ActiveRecord model mapped to the chatbot_topic_title_embeddings table.
  # Each record must reference a topic_id, and a given topic_id may appear
  # on at most one record.
  class TopicTitleEmbedding < ActiveRecord::Base
    self.table_name = "chatbot_topic_title_embeddings"

    validates_presence_of :topic_id
    validates_uniqueness_of :topic_id
  end
end
1 change: 1 addition & 0 deletions config/locales/server.en.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ en:
chatbot_forum_search_function_max_results: "Maximum number of Posts shown to bot from a forum search"
chatbot_forum_search_function_similarity_threshold: "Fraction of 1. The point at which a Post is included in the results is defined by how similar it is to the query with 1 being identical and zero being nothing like the query"
chatbot_forum_search_function_reranking_group_promotion: "Posts authored by these Groups are promoted in results set so their Posts are more likely to be seen and taken into account by the bot"
chatbot_forum_search_function_include_topic_titles: "Include Topic titles in the search results"
chatbot_forum_search_function_results_content_type: "The scope of content to be returned in the search results. Choose 'post' to return just the ranked Posts, or 'topic' to return the entire Topics that contain those ranked Posts"
chatbot_forum_search_function_results_topic_max_posts_count_strategy: "The strategy used to determine the maximum number of Posts to be returned in the search results if content_type is 'topic'. Choose 'all' for all Posts, 'just_enough' to limit the Posts to only those up to and including the ranked Post, 'stretch_if_required' to include all Posts up to the ranked Post regardless of the max setting, 'exact' for exactly the number of Posts specified in the max setting"
chatbot_forum_search_function_results_topic_max_posts_count: "The maximum number of Posts to be returned in the search results if content_type is 'topic'"
Expand Down
3 changes: 3 additions & 0 deletions config/settings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,9 @@ plugins:
default: ""
allow_any: false
refresh: true
chatbot_forum_search_function_include_topic_titles:
client: false
default: false
chatbot_forum_search_function_results_content_type:
client: false
default: "post"
Expand Down
4 changes: 2 additions & 2 deletions db/migrate/20230820010103_create_chatbot_embeddings_table.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ class CreateChatbotEmbeddingsTable < ActiveRecord::Migration[7.0]
def change
create_table :chatbot_embeddings do |t|
t.integer :post_id, null: false, index: { unique: true }, foreign_key: true
t.column :embedding, "real[]", null: false
t.timestamps
t.column :embedding, "real[]", null: false
t.timestamps
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ class CreateNewChatbotPostEmbeddingsTable < ActiveRecord::Migration[7.0]
def change
create_table :chatbot_post_embeddings do |t|
t.integer :post_id, null: false, index: { unique: true }, foreign_key: true
t.column :embedding, "vector(1536)", null: false
t.timestamps
t.column :embedding, "vector(1536)", null: false
t.timestamps
end
end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# frozen_string_literal: true

# Creates the chatbot_topic_title_embeddings table: one row per topic_id
# (unique index), holding a 1536-dimension vector and an optional model name.
class CreateChatbotTopicTitleEmbeddingsTable < ActiveRecord::Migration[7.0]
  def change
    create_table :chatbot_topic_title_embeddings do |table|
      # NOTE(review): foreign_key: true is passed to a plain integer column
      # here; confirm whether it actually produces a foreign key constraint.
      table.integer :topic_id, null: false, index: { unique: true }, foreign_key: true
      table.column :embedding, "vector(1536)", null: false
      table.string :model, default: nil
      table.timestamps
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# frozen_string_literal: true

# Creates the chatbot_topic_embeddings_bookmark table, which stores a
# nullable integer topic_id plus the standard timestamp columns.
class CreateChatbotTopicEmbeddingsBookmarkTable < ActiveRecord::Migration[7.0]
  def change
    create_table :chatbot_topic_embeddings_bookmark do |table|
      table.column :topic_id, :integer
      table.timestamps
    end
  end
end
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

# Adds (and, on rollback, removes) an HNSW index using vector_cosine_ops on
# the embedding column of chatbot_topic_title_embeddings.
class CreateCosinePgVectorChatbotTopicTitleEmbeddingsIndex < ActiveRecord::Migration[7.0]
  INDEX_NAME = "pgv_hnsw_index_on_chatbot_topic_title_embeddings"

  # Builds the index with raw SQL.
  def up
    create_statement = <<~SQL
      CREATE INDEX #{INDEX_NAME} ON chatbot_topic_title_embeddings USING hnsw (embedding vector_cosine_ops)
      WITH (m = 32, ef_construction = 64);
    SQL

    execute create_statement
  end

  # Drops the index; IF EXISTS keeps the rollback safe when it is absent.
  def down
    drop_statement = <<~SQL
      DROP INDEX IF EXISTS #{INDEX_NAME};
    SQL

    execute drop_statement
  end
end
39 changes: 38 additions & 1 deletion lib/discourse_chatbot/embedding_completionist_process.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,46 @@ module ::DiscourseChatbot
class EmbeddingCompletionist

# Runs one completion pass: first the Post pass (process_posts), then the
# Topic title pass (process_topics).
#
# Returns whatever process_topics returns, as it is the last expression.
def self.process
  process_posts
  process_topics
end

# Enqueues a :chatbot_topic_title_embedding job for the next chunk of Topics
# (ordered by id, starting at the bookmarked Topic) and then moves the
# bookmark to the Topic that follows the chunk.
#
# The chunk size is EMBEDDING_PROCESS_POSTS_CHUNK scaled by the ratio of
# Topics to Posts, rounded up.
#
# When the chunk reaches the end of the Topics, or the bookmark points past
# the last existing Topic, the bookmark wraps around to the first Topic so
# the next pass starts over instead of stalling.
#
# Returns the Topic id now stored in the bookmark, or nil when there are no
# Topics to process.
def self.process_topics
  first_topic_id = ::Topic.first&.id
  return if first_topic_id.nil?

  bookmark = ::DiscourseChatbot::TopicEmbeddingsBookmark.first
  start_topic_id = bookmark&.topic_id || first_topic_id

  # Guard the divisor: dividing by a zero Post count yields Infinity/NaN and
  # calling #ceil on that raises FloatDomainError.
  post_count = [::Post.count, 1].max
  limit = (EMBEDDING_PROCESS_POSTS_CHUNK * ::Topic.count.fdiv(post_count)).ceil

  topic_ids = ::Topic.where("id >= ?", start_topic_id).order(:id).limit(limit).pluck(:id)

  topic_ids.each do |topic_id|
    Jobs.enqueue(:chatbot_topic_title_embedding, id: topic_id)
  end

  # Only the Topic following the last enqueued one matters, so look it up once
  # after the loop rather than once per Topic.
  next_topic_id =
    if topic_ids.empty?
      nil
    else
      ::Topic.where("id > ?", topic_ids.last).order(:id).limit(1).pluck(:id).first
    end

  # Nothing left after this chunk (or nothing found at all): wrap around.
  next_topic_id ||= first_topic_id

  bookmark ||= ::DiscourseChatbot::TopicEmbeddingsBookmark.new
  bookmark.topic_id = next_topic_id
  bookmark.save!

  ::DiscourseChatbot.progress_debug_message <<~EOS
    ---------------------------------------------------------------------------------------------------------------
    Topic Embeddings Completion Bookmark is now at Topic: #{bookmark.topic_id}
    ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  EOS

  bookmark.topic_id
end

def self.process_posts
bookmarked_post_id = ::DiscourseChatbot::PostEmbeddingsBookmark.first&.post_id || ::Post.first.id

post_range = ::Post.where("id >= ?", bookmarked_post_id).order(:id).limit(EMBEDDING_PROCESS_CHUNK).pluck(:id)
post_range = ::Post.where("id >= ?", bookmarked_post_id).order(:id).limit(EMBEDDING_PROCESS_POSTS_CHUNK).pluck(:id)

post_range.each do |post_id|
Jobs.enqueue(:chatbot_post_embedding, id: post_id)
Expand Down
73 changes: 73 additions & 0 deletions lib/discourse_chatbot/embedding_process.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# frozen_string_literal: true
require "openai"

module ::DiscourseChatbot

  # Base class for embedding processes. It provides the shared OpenAI client
  # setup and the benchmark-user lookup; every other method is a placeholder
  # that raises "Overwrite me!" and is meant to be overridden by a subclass.
  class EmbeddingProcess

    # Configures the OpenAI gem from the site settings and builds a client.
    #
    # Sets the access token, optionally a custom base URL for the embeddings
    # model, and the Azure API type/version when the custom API type setting
    # is "azure". Stores the embeddings model name in @model_name and the
    # client in @client.
    #
    # Returns the new ::OpenAI::Client.
    def setup_api
      ::OpenAI.configure do |config|
        config.access_token = SiteSetting.chatbot_open_ai_token

        custom_url = SiteSetting.chatbot_open_ai_embeddings_model_custom_url
        config.uri_base = custom_url if custom_url.present?

        if SiteSetting.chatbot_open_ai_model_custom_api_type == "azure"
          config.api_type = :azure
          config.api_version = SiteSetting.chatbot_open_ai_model_custom_api_version
        end
      end

      @model_name = SiteSetting.chatbot_open_ai_embeddings_model
      @client = ::OpenAI::Client.new
    end

    # Placeholder: subclasses create or update the embedding for +id+.
    def upsert(id)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses fetch the embedding for +id+ from the API.
    def get_embedding_from_api(id)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses run a semantic search for +query+.
    def semantic_search(query)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses decide whether +id+ is in scope for embeddings.
    def in_scope(id)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses decide whether the embedding for +id+ is valid.
    def is_valid(id)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses check +id+ against the categories scope.
    def in_categories_scope(id)
      raise "Overwrite me!"
    end

    # Placeholder: subclasses check +id+ against the benchmark user scope.
    def in_benchmark_user_scope(id)
      raise "Overwrite me!"
    end

    # Finds the user used as the benchmark for the embedding suitability
    # check. The result is cached for one hour under "chatbot_benchmark_user".
    #
    # The benchmark user is the last user (by id) who belongs to no group
    # outside the allowed list, has the trust level from the
    # chatbot_embeddings_benchmark_user_trust_level setting, and is active,
    # not an admin and not suspended.
    #
    # Returns the matching User.
    # Raises StandardError when no such user exists.
    def benchmark_user
      Discourse.cache.fetch("chatbot_benchmark_user", expires_in: 1.hour) do
        allowed_group_ids = [0, 10, 11, 12, 13, 14] # automated groups only
        barred_group_ids = ::Group.where.not(id: allowed_group_ids).pluck(:id) # no custom groups
        # don't choose someone who is in a custom group
        unsuitable_user_ids = ::GroupUser.where(group_id: barred_group_ids).pluck(:user_id).uniq

        # Filter in the database rather than loading every eligible user id
        # into memory and sending it back as one large IN list.
        candidate =
          ::User
            .where.not(id: unsuitable_user_ids)
            .where(
              trust_level: SiteSetting.chatbot_embeddings_benchmark_user_trust_level,
              active: true,
              admin: false,
              suspended_at: nil,
            )
            .last

        if candidate.nil?
          raise StandardError, "Chatbot: No benchmark user exists for Post embedding suitability check, please add a basic user"
        end

        candidate
      end
    end
  end
end
51 changes: 41 additions & 10 deletions lib/discourse_chatbot/functions/forum_search_function.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,32 +27,61 @@ def required
def process(args)
begin
super(args)

top_topics = []
top_topics_from_post_results = []
top_topics_from_topic_title_results = []
top_topic_title_results = []
query = args[parameters[0][:name]]
number_of_posts = args[parameters[1][:name]].blank? ? 3 : args[parameters[1][:name]]
number_of_posts = number_of_posts > SiteSetting.chatbot_forum_search_function_max_results ? SiteSetting.chatbot_forum_search_function_max_results : number_of_posts

process_post_embedding = ::DiscourseChatbot::PostEmbeddingProcess.new
results = process_post_embedding.semantic_search(query)

top_results = results[0..(number_of_posts - 1)]

if SiteSetting.chatbot_forum_search_function_include_topic_titles
process_topic_title_embedding = ::DiscourseChatbot::TopicTitleEmbeddingProcess.new
topic_title_results = process_topic_title_embedding.semantic_search(query)
top_topic_title_results = topic_title_results[0..(number_of_posts - 1)]

# exclude if not in scope for embeddings (job hasn't caught up yet)
top_topic_title_results = top_topic_title_results.filter { |result| ::DiscourseChatbot::TopicTitleEmbeddingProcess.new.in_scope(result[:topic_id]) && ::DiscourseChatbot::TopicTitleEmbeddingProcess.new.is_valid(result[:topic_id])}
end

# exclude if not in scope for embeddings (job hasn't caught up yet)
top_results.select { |result| !::DiscourseChatbot::PostEmbeddingProcess.new.in_scope(result[:post_id]) || !::DiscourseChatbot::PostEmbeddingProcess.new.is_valid( result[:post_id])}
top_results = top_results.filter { |result| ::DiscourseChatbot::PostEmbeddingProcess.new.in_scope(result[:post_id]) && ::DiscourseChatbot::PostEmbeddingProcess.new.is_valid( result[:post_id])}

if SiteSetting.chatbot_forum_search_function_results_content_type == "topic" || top_topic_title_results.length > 0
top_topics_from_post_results = top_results.map { |result| ::Post.find(result[:post_id].to_i).topic_id }.uniq
top_topics_from_topic_title_results = top_topic_title_results.map { |result| result[:topic_id].to_i }.uniq
top_topics = (top_topics_from_post_results + top_topics_from_topic_title_results).uniq

if SiteSetting.chatbot_forum_search_function_results_content_type == "topic"
top_topics = top_results.map { |result| ::Post.find(result[:post_id].to_i).topic_id }.uniq
response = I18n.t("chatbot.prompt.function.forum_search.answer.topic.summary", number_of_topics: top_topics.length)

accepted_post_types = SiteSetting.chatbot_include_whispers_in_post_history ? ::DiscourseChatbot::POST_TYPES_INC_WHISPERS : ::DiscourseChatbot::POST_TYPES_REGULAR_ONLY

top_topics.each_with_index do |topic_id, index|
top_result = top_results.find do |result|
top_post_result = {}
top_post_result = top_results.find do |result|
post_topic_id = ::Post.find(result[:post_id].to_i).topic_id
post_topic_id == topic_id
end
score = top_result[:score]
original_post_number = ::Post.find(top_result[:post_id]).post_number

top_topic_title_result = {}
top_topic_title_result = top_topic_title_results.find do |result|
topic_id == result[:topic_id]
end

original_post_number = nil

if !top_post_result.blank?
score = top_post_result[:score]
original_post_number = ::Post.find(top_post_result[:post_id]).post_number
else
score = top_topic_title_result[:score]
end

current_topic = ::Topic.find(topic_id)
url = "https://#{Discourse.current_hostname}/t/slug/#{current_topic.id}"
title = current_topic.title
Expand All @@ -63,12 +92,13 @@ def process(args)
when "all"
Topic.find(topic_id).highest_post_number
when "just_enough"
original_post_number
original_post_number || SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
when "stretch_if_required"
original_post_number > SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count ? original_post_number : SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
(original_post_number || 0) > SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count ? original_post_number : SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
else
SiteSetting.chatbot_forum_search_function_results_topic_max_posts_count
end

while post_number <= max_post_number do
post = ::Post.find_by(topic_id: topic_id, post_number: post_number )
break if post.nil?
Expand All @@ -90,7 +120,8 @@ def process(args)
end
end
response
rescue
rescue StandardError => e
Rails.logger.error("Chatbot: Error occurred while attempting to retrieve Forum Search results for query '#{query}': #{e.message}")
I18n.t("chatbot.prompt.function.forum_search.error", query: args[parameters[0][:name]])
end
end
Expand Down
Loading

0 comments on commit 8bbff74

Please sign in to comment.