Skip to content

Commit f165c67

Browse files
authored
feat: Add sqlite-vss for vector search similarity capabilities
* feat: Add sqlite-vss to add similarity search to sqlite * feat: Integrate with langchain for sqlite-vss implementation * feat: VSS now fully working * fix: remove unused file * fix: Adjust unit test for similarity search
1 parent f64f9fe commit f165c67

12 files changed

+567
-540
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
contentmap.db
2+
.DS_Store
23
/scratch
34

45
Byte-compiled / optimized / DLL files

Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,4 @@ RUN python3 -c 'from sentence_transformers import SentenceTransformer; embedder
1818

1919
ADD . /app
2020

21-
CMD ["pytest", "./tests"]
21+
CMD ["pytest", "./tests"]

contentmap/__init__.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from contentmap.core import ContentMapCreator
2+
from contentmap.sitemap import SitemapToContentDatabase

contentmap/core.py

+26-12
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,44 @@
1-
from typing import List, Dict, TypedDict
1+
from typing import List, Dict
22
from datetime import datetime
33
import sqlite3
4-
import importlib.metadata
4+
from contentmap.vss import ContentMapVSS
55

66

77
class ContentMapCreator:
88

99
def __init__(
1010
self,
1111
contents: List[Dict[str, str]],
12-
database: str = "contentmap.db"
12+
database: str = "contentmap.db",
13+
include_vss: bool = False
1314
):
1415
self.contents = contents
15-
self.db = sqlite3.connect(database)
16-
self.db.row_factory = sqlite3.Row
17-
self.cursor = self.db.cursor()
16+
self.include_vss = include_vss
17+
self.connection = sqlite3.connect(database)
18+
self.connection.row_factory = sqlite3.Row
19+
20+
if self.include_vss:
21+
import sqlite_vss
22+
self.connection.enable_load_extension(True)
23+
sqlite_vss.load(self.connection)
24+
self.connection.enable_load_extension(False)
25+
26+
self.cursor = self.connection.cursor()
1827

1928
def init_db(self):
2029
self.cursor.execute("CREATE TABLE IF NOT EXISTS content (url, content)")
2130
self.cursor.execute("CREATE TABLE IF NOT EXISTS config (cat, value)")
22-
self.db.commit()
31+
self.connection.commit()
2332

2433
def add_config(self):
2534
data = [
26-
{"Version:": "1"},
2735
{"Generated with:": "Contentmap lib"},
2836
{"Date:": datetime.now().strftime("%Y-%m-%d %H:%M:%S")},
29-
{"Embeddings:": "mistral-embed"},
30-
{"FTSE:": None},
37+
{"Embeddings:": "all-MiniLM-L6-v2"}
3138
]
3239
data = [{"cat": k, "value": v} for row in data for k, v in row.items()]
3340
self.cursor.executemany("INSERT INTO config VALUES (:cat, :value)", data)
34-
self.db.commit()
41+
self.connection.commit()
3542

3643
def build(self):
3744
self.init_db()
@@ -40,4 +47,11 @@ def build(self):
4047
"INSERT INTO content VALUES (:url, :content)",
4148
self.contents
4249
)
43-
self.db.commit()
50+
self.connection.commit()
51+
52+
if self.include_vss:
53+
self.add_vss()
54+
55+
def add_vss(self):
56+
cm_vss = ContentMapVSS(connection=self.connection)
57+
cm_vss.load()

contentmap/ftse.py

Whitespace-only changes.

contentmap/sitemap.py

+5-3
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,21 @@
1212

1313
class SitemapToContentDatabase:
1414

15-
def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None):
15+
def __init__(self, sitemap_url, seconds_timeout=10, concurrency=None,
16+
include_vss=False):
1617
self.sitemap_url = sitemap_url
1718
self.semaphore = asyncio.Semaphore(concurrency) if concurrency is not None else None
1819
self.timeout = aiohttp.ClientTimeout(
1920
sock_connect=seconds_timeout,
2021
sock_read=seconds_timeout
2122
)
23+
self.include_vss = include_vss
2224

23-
def load(self):
25+
def build(self):
2426
urls = self.get_urls()
2527
loop = asyncio.get_event_loop()
2628
contents = loop.run_until_complete(self.get_contents(urls))
27-
cm = ContentMapCreator(contents)
29+
cm = ContentMapCreator(contents, include_vss=self.include_vss)
2830
cm.build()
2931

3032
def get_urls(self):

contentmap/vss.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@
55
import sqlite3
66
from typing import Optional
77

8-
import sqlite_vss
9-
from langchain.text_splitter import RecursiveCharacterTextSplitter
108
from langchain.text_splitter import CharacterTextSplitter
11-
from langchain_community.document_loaders import TextLoader
129
from langchain_community.embeddings.sentence_transformer import (
1310
SentenceTransformerEmbeddings,
1411
)
@@ -59,8 +56,10 @@ def prepare_texts_and_metadatas(self):
5956
rows = result.fetchall()
6057

6158
# based on Anyscale analysis (https://t.ly/yjgxQ), it looks like the
62-
# sweet spot is 700 chunk size and 50 chunk overlap
63-
text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=50)
59+
# sweet spot is 700 chunk size and 50 chunk overlap.
60+
text_splitter = CharacterTextSplitter(
61+
chunk_size=700, chunk_overlap=50, separator="."
62+
)
6463

6564
texts = []
6665
metadatas = []
@@ -73,4 +72,9 @@ def prepare_texts_and_metadatas(self):
7372
return texts, metadatas
7473

7574
def similarity_search(self, *args, **kwargs):
76-
return self.vss.similarity_search(*args, **kwargs)
75+
data = self.vss.similarity_search(*args, **kwargs)
76+
rag_results = []
77+
for doc in data:
78+
item = {"content": doc.page_content, "url": doc.metadata['url']}
79+
rag_results.append(item)
80+
return rag_results

docker-compose.yaml

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
version: "3"
2+
services:
3+
app:
4+
build:
5+
context: .
6+
image: contentmap_local

0 commit comments

Comments
 (0)