Skip to content

Commit 5865005

Browse files
authored Feb 26, 2024
feat: Add sqlite-vss to add similarity search to sqlite (#4)
* feat: Add sqlite-vss to add similarity search to sqlite * feat: Integrate with langchain for sqlite-vss implementation
1 parent 60b7198 commit 5865005

13 files changed

+1927
-5
lines changed
 

‎.github/workflows/ci.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ jobs:
1313
strategy:
1414
fail-fast: true
1515
matrix:
16-
python-version: ["3.8", "3.9", "3.10"]
16+
python-version: ["3.8", "3.9", "3.10", "3.11"]
1717

1818
steps:
1919
- uses: actions/checkout@v3

‎.gitignore

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
contentmap.db
2-
2+
/scratch
33

44
Byte-compiled / optimized / DLL files
55
__pycache__/

‎Dockerfile

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Image used to run the contentmap test suite.
FROM ubuntu:22.04

# Refresh the package index and install in ONE layer: with separate RUN
# steps a cached `apt update` layer can be reused with a stale index when
# the package list below changes.
RUN apt-get update \
    && apt-get install -y python3-pip libgomp1 libatlas-base-dev liblapack-dev libsqlite3-dev

WORKDIR /app

# Copy only the dependency manifests first so the dependency layers below
# are rebuilt only when the manifests change. COPY (not ADD) because these
# are plain local files; ADD's archive/URL handling is not wanted here.
COPY poetry.lock /app/poetry.lock
COPY pyproject.toml /app/pyproject.toml

RUN pip install poetry
# Install into the system interpreter instead of a Poetry-managed virtualenv.
RUN poetry config virtualenvs.create false
RUN poetry install

# Instantiate the embedding model during the build.
# NOTE(review): presumably done so the model download is cached in the image
# rather than fetched on every test run -- confirm.
RUN python3 -c 'from sentence_transformers import SentenceTransformer; embedder = SentenceTransformer("all-MiniLM-L6-v2")'

# Project sources go in last so source edits do not invalidate the layers above.
COPY . /app

CMD ["pytest", "./tests"]

‎README.md

+27-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,29 @@
11
# Content map
22

3-
A way to share content from a specific domain. Similar to sitemaps but for content.
3+
A way to share content from a specific domain using SQLite as an alternative to
4+
RSS feeds. The purpose of this library is to simply create a dataset for all the
5+
content on your website, using the XML sitemap as a starting point.
6+
7+
8+
## Installation
9+
10+
```bash
11+
12+
pip install contentmap
13+
14+
```
15+
16+
## Quickstart
17+
18+
To build your contentmap.db that will contain all your content using your XML
19+
sitemap as a starting point, you only need to write the following:
20+
21+
```python
22+
from contentmap.sitemap import SitemapToContentDatabase
23+
24+
database = SitemapToContentDatabase("https://yourblog.com/sitemap.xml")
25+
database.load()
26+
27+
```
28+
29+
You can control how many URLs are crawled concurrently and also set a timeout.

‎contentmap/ftse.py

Whitespace-only changes.

‎contentmap/vss.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
"""
2+
Class ContentMapVSS to create vector search dataset from a contentmap
3+
dataset already created.
4+
"""
5+
import sqlite3
6+
from typing import Optional
7+
8+
import sqlite_vss
9+
from langchain.text_splitter import RecursiveCharacterTextSplitter
10+
from langchain.text_splitter import CharacterTextSplitter
11+
from langchain_community.document_loaders import TextLoader
12+
from langchain_community.embeddings.sentence_transformer import (
13+
SentenceTransformerEmbeddings,
14+
)
15+
from langchain_community.vectorstores import SQLiteVSS
16+
17+
18+
class ContentMapVSS:
    """Create and query a vector-search dataset from an existing contentmap database.

    The text stored in the ``content`` table is split into chunks, and the
    chunks are added to a langchain ``SQLiteVSS`` store configured with the
    ``content_chunks`` table and the ``all-MiniLM-L6-v2`` embedding model.
    """

    def __init__(self,
                 connection: Optional[sqlite3.Connection] = None,
                 db_file: str = "contentmap.db"
                 ):
        """Attach to a database and set up the vector store.

        Args:
            connection: Connection to reuse. When omitted, one is created
                with ``SQLiteVSS.create_connection(db_file)``.
            db_file: Database file to open when ``connection`` is not given.
        """
        # Only a missing connection should trigger opening a new one.
        if connection is None:
            connection = SQLiteVSS.create_connection(db_file)
        self.connection = connection

        embedding_function = SentenceTransformerEmbeddings(
            model_name="all-MiniLM-L6-v2"
        )
        self.vss = SQLiteVSS(
            table="content_chunks",
            embedding=embedding_function,
            connection=self.connection
        )

    def load(self):
        """Chunk every row of ``content`` and add the chunks to the vector store.

        Returns:
            The ``SQLiteVSS`` store held in ``self.vss``.

        Raises:
            AssertionError: If the ``content`` table does not exist.
        """
        # Raised explicitly rather than through an `assert` statement so the
        # check still runs under `python -O`; the exception type is unchanged.
        if not self.table_exists(table_name="content"):
            raise AssertionError("table 'content' does not exist")
        texts, metadatas = self.prepare_texts_and_metadatas()
        self.vss.add_texts(texts=texts, metadatas=metadatas)
        return self.vss

    def table_exists(self, table_name: str) -> bool:
        """Return True when ``sqlite_master`` lists a table named ``table_name``."""
        # Bind the name as a parameter: interpolating it into the SQL text
        # lets a crafted name rewrite the WHERE clause.
        result = self.connection.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?;",
            (table_name,),
        )
        return result.fetchone() is not None

    def prepare_texts_and_metadatas(self):
        """Split each row of ``content`` into chunks paired with its URL.

        Returns:
            A ``(texts, metadatas)`` tuple of equal-length lists: every text
            chunk, and for each chunk a ``{"url": ...}`` dict naming the row
            it came from.
        """
        cursor = self.connection.cursor()
        # Name-based row access below must not depend on how the caller
        # configured the connection, so set the row factory on this cursor.
        cursor.row_factory = sqlite3.Row
        rows = cursor.execute("SELECT content, url FROM content").fetchall()

        # based on Anyscale analysis (https://t.ly/yjgxQ), it looks like the
        # sweet spot is 700 chunk size and 50 chunk overlap
        text_splitter = CharacterTextSplitter(chunk_size=700, chunk_overlap=50)

        texts = []
        metadatas = []
        for row in rows:
            chunks = text_splitter.split_text(row["content"])
            texts.extend(chunks)
            # One independent dict per chunk, all pointing at the source URL.
            metadatas.extend({"url": row["url"]} for _ in chunks)

        return texts, metadatas

    def similarity_search(self, *args, **kwargs):
        """Forward all arguments to ``self.vss.similarity_search`` and return its result."""
        return self.vss.similarity_search(*args, **kwargs)

‎poetry.lock

+1,720-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

‎pyproject.toml

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,9 @@ tqdm = "^4.66.1"
1414
lxml = "4.9.4"
1515
trafilatura = "^1.6.4"
1616
aiohttp = "^3.9.1"
17+
sqlite-vss = "^0.1.2"
18+
langchain = "^0.1.8"
19+
sentence-transformers = "^2.3.1"
1720

1821

1922
[tool.poetry.group.test.dependencies]

‎tests/conftest.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import pytest
22
import os
3+
import os.path as op
4+
import logging
35

46

57
@pytest.fixture(autouse=True)
@@ -8,5 +10,9 @@ def remove_created_database_after_test():
810
# Setup logic
911
yield # this is where the testing happens
1012
# Teardown logic
11-
os.remove("contentmap.db")
13+
14+
contentmap_db_path = op.join(op.dirname(__file__), "contentmap.db")
15+
if op.exists(contentmap_db_path):
16+
logging.info('Destroying mock sqlite content instance')
17+
os.remove(contentmap_db_path)
1218

‎tests/fixture.db

20 KB
Binary file not shown.

‎tests/test_fts.py

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import sqlite3
2+
3+
4+
def test_fts_extension_enabled():
    """Check that the linked SQLite library was compiled with FTS5 enabled."""
    con = sqlite3.connect(':memory:')
    try:
        cur = con.cursor()
        cur.execute('pragma compile_options;')
        available_pragmas = cur.fetchall()
    finally:
        # Close even when the query raises, so the connection is never leaked.
        con.close()

    # With the default row factory each compile option is a 1-tuple.
    assert ('ENABLE_FTS5',) in available_pragmas

‎tests/test_vss.py

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
from contentmap.vss import ContentMapVSS
2+
import os.path as op
3+
from tests.utils import build_fixture_db
4+
5+
6+
class TestContentMapVSS:
    """Checks for ``ContentMapVSS.table_exists``."""

    def test_assertion_content_exists(self):
        """``fixture.db`` next to this file is expected to have a ``content`` table."""
        tests_dir = op.dirname(__file__)
        store = ContentMapVSS(db_file=op.join(tests_dir, "fixture.db"))
        assert store.table_exists(table_name="content") is True

    def test_assertion_content_not_exists(self):
        """A fresh in-memory database is expected to have no ``content`` table."""
        store = ContentMapVSS(db_file=":memory:")
        assert store.table_exists(table_name="content") is False
16+
17+
18+
class TestVssTablesCreation:
    """Checks that run ``ContentMapVSS`` against a copy of the fixture database."""

    def test_vss_instance(self):
        """After ``load()`` the ``content_chunks`` table exists."""
        store = ContentMapVSS(db_file=build_fixture_db())
        store.load()
        assert store.table_exists("content_chunks")

    def test_prepare_texts_and_metadatas(self):
        """Texts and metadatas have the same, non-zero length."""
        store = ContentMapVSS(db_file=build_fixture_db())
        texts, metadatas = store.prepare_texts_and_metadatas()
        assert len(texts) == len(metadatas) >= 1

    def test_chunk_table(self):
        """After ``load()`` the ``content_chunks`` table holds at least 15 rows."""
        store = ContentMapVSS(db_file=build_fixture_db())
        store.load()
        assert store.table_exists("content_chunks")
        chunk_rows = (
            store.connection.cursor()
            .execute("SELECT * FROM content_chunks")
            .fetchall()
        )
        assert len(chunk_rows) >= 15

    def test_similarity_search(self):
        """Both hits for the Mistral query carry the expected article URL."""
        store = ContentMapVSS(db_file=build_fixture_db())
        store.load()
        hits = store.similarity_search(query="who is Mistral ai company?", k=2)
        assert len(hits) == 2
        for metadata in [doc.metadata for doc in hits]:
            assert metadata.get("url") == "https://philippeoger.com/pages/ai-scene-in-europe-last-week/"

‎tests/utils.py

+9
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import os.path as op
2+
import shutil
3+
4+
5+
def build_fixture_db():
    """Copy ``fixture.db`` to ``contentmap.db`` beside this module.

    Returns:
        The path of the copied ``contentmap.db`` file.
    """
    here = op.dirname(__file__)
    source, target = (
        op.join(here, name) for name in ('fixture.db', 'contentmap.db')
    )
    shutil.copy(source, target)
    return target

0 commit comments

Comments
 (0)
Please sign in to comment.