Skip to content

Commit 41ca718

Browse files
committed
compile quotes into topic clusters
1 parent 79c61f4 commit 41ca718

6 files changed

+671
-1
lines changed

.DS_Store

0 Bytes
Binary file not shown.

.gitignore

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,6 @@ node_modules
33
data*.json
44
amazon_links.json
55
records*.json
6-
books-notion-table.csv
6+
books-notion-table.csv
7+
kindle-highlights
8+
kindle-export-supabase.js

book-covers-db.js

Whitespace-only changes.

compile-quotes.js

+183
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
/*
2+
compile all quotes from supabase where BookID = X.
3+
Then use GPT to cluster them into a few categories.
4+
Then create an easy-to-read story about highlights.
5+
*/
6+
7+
import { createClient } from "@supabase/supabase-js";
8+
import dotenv from "dotenv";
9+
import fs from "fs";
10+
import similarity from "compute-cosine-similarity";
11+
import { Configuration, OpenAIApi } from "openai";
12+
13+
dotenv.config();
14+
15+
// OpenAI client, configured from the OPENAI_API_KEY / OPENAI_ORG environment variables.
const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY,
  organization: process.env.OPENAI_ORG,
});

const openai = new OpenAIApi(configuration);

// NOTE: dotenv.config() already ran right after the imports, so it is not repeated here.

// Similarity threshold that main() passes to clusterEmbeddings().
const CLUSTER_THRESHOLD = 0.8;
// Book whose highlights main() compiles.
const BOOK_ID = "33431849";

// Supabase client, configured from the SUPABASE_URL / SUPABASE_KEY environment variables.
const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_KEY;
const supabase = createClient(supabaseUrl, supabaseKey, {
  auth: {
    persistSession: false,
  },
});
34+
35+
/**
 * Fetches every row of the "highlights" table whose book_id equals `bookId`.
 *
 * @param {string} bookId - Value matched against the book_id column.
 * @returns {Promise<Array<object>>} The matching rows, or an empty array when
 *   the query reports an error (the error is logged, not thrown).
 */
const getQuotes = async (bookId) => {
  const response = await supabase
    .from("highlights")
    .select("*")
    .eq("book_id", bookId);

  if (!response.error) {
    return response.data;
  }

  // Best effort: report the failure and let the caller continue with no quotes.
  console.error(response.error);
  return [];
};
48+
49+
/**
 * Looks up a book by a case-insensitive substring match on its title.
 *
 * @param {string} title - Text that must appear somewhere in the book title.
 * @returns {Promise<string|null>} The book_id of the first matching row, or
 *   null when the title is blank, the query fails, or nothing matches.
 */
const getBookIDFromTitle = async (title) => {
  // A blank title would become the pattern "%%", which matches every book and
  // would return an arbitrary first row; treat it as "not found" instead.
  if (title == null || String(title).trim() === "") {
    return null;
  }

  const { data, error } = await supabase
    .from("books")
    .select("book_id")
    .ilike("title", `%${title}%`);

  if (error) {
    console.error(error);
    return null;
  }

  // No matching rows (or no data at all) means the book is unknown.
  if (!data || data.length === 0) {
    return null;
  }

  return data[0].book_id;
};
67+
68+
/**
 * Resolves a book by title and returns its quotes.
 *
 * @param {string} bookTitle - Title text handed to getBookIDFromTitle().
 * @returns {Promise<Array<object>>} Whatever getQuotes() yields for the
 *   resolved book, or an empty array when no book id could be resolved.
 */
const compileQuotesFromTitle = async (bookTitle) => {
  const bookId = await getBookIDFromTitle(bookTitle);

  if (bookId) {
    return getQuotes(bookId);
  }

  console.error("Book not found.");
  return [];
};
78+
79+
/**
 * Returns the quotes for a book that is already identified by its id.
 * (The "Fom" spelling is kept because main() calls the function by this name.)
 *
 * @param {string} bookID - Book id forwarded unchanged to getQuotes().
 * @returns {Promise<Array<object>>} The result of getQuotes(bookID).
 */
const compileQuotesFomID = async (bookID) => {
  return getQuotes(bookID);
};
83+
84+
/**
 * Greedily groups quotes by the similarity of their embeddings.
 *
 * Quotes are visited in order. Each quote that has no `cluster` yet seeds a
 * new cluster, and every other still-unassigned quote whose similarity to the
 * seed is strictly greater than `threshold` joins that cluster. The assigned
 * cluster index is written onto each quote as `quote.cluster`.
 *
 * @param {Array<{embedding: string, cluster?: number}>} quotes - Quotes whose
 *   `embedding` is a JSON-encoded value accepted by `similarity()`.
 * @param {number} [threshold=0.01] - Minimum (exclusive) similarity to a
 *   cluster's seed for a quote to join that cluster.
 * @returns {Array<Array<object>>} Clusters in creation order; each cluster
 *   starts with its seed quote.
 */
function clusterEmbeddings(quotes, threshold = 0.01) {
  const clusters = [];

  // Parse each JSON-encoded embedding lazily and at most once, instead of
  // re-parsing both embeddings on every pairwise comparison.
  const vectors = new Array(quotes.length);
  const vectorAt = (position) => {
    if (vectors[position] === undefined) {
      vectors[position] = JSON.parse(quotes[position].embedding);
    }
    return vectors[position];
  };

  let similaritySum = 0;
  let comparisons = 0;

  quotes.forEach((quote, index) => {
    // Already assigned to a cluster: it cannot seed a new one.
    if (quote.cluster !== undefined) return;

    const clusterIndex = clusters.length;
    const members = [quote];
    quote.cluster = clusterIndex;
    clusters.push(members);

    const seedVector = vectorAt(index);

    quotes.forEach((otherQuote, otherIndex) => {
      if (index === otherIndex) return;

      const score = similarity(seedVector, vectorAt(otherIndex));
      if (otherQuote.cluster === undefined && score > threshold) {
        otherQuote.cluster = clusterIndex;
        members.push(otherQuote);
      }

      // Every seed-vs-other comparison feeds the average, even when the other
      // quote was already assigned elsewhere.
      comparisons++;
      similaritySum += score;
    });
  });

  // With fewer than two quotes nothing was compared; skip the 0/0 = NaN log.
  if (comparisons > 0) {
    console.log("Average similarity: " + similaritySum / comparisons);
  }

  return clusters;
}
120+
121+
/**
 * Asks the chat model for a topic that summarizes a cluster of quotes.
 *
 * @param {Array<{text: string}>} cluster - Quotes whose `text` values are sent
 *   to the model, one per line.
 * @returns {Promise<string>} The model's reply with surrounding whitespace and
 *   any leading Markdown "#" heading markers removed.
 * @throws Rethrows the original error when the request or the response
 *   handling fails, after logging it.
 */
const assignTopicToCluster = async (cluster) => {
  const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #`;

  try {
    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    // Strip only LEADING whitespace/"#" characters: the model sometimes answers
    // with "## Topic" despite the prompt, while a "#" inside the topic itself
    // (e.g. "C# tips") must be preserved.
    return content.replace(/^[\s#]+/, "").trim();
  } catch (err) {
    console.error("Failed to assign a topic to the cluster:");
    console.error(err);

    // Only errors that carry an HTTP response have API error details; guard
    // each level so a network failure is not masked by a TypeError thrown
    // while logging.
    const apiError =
      err && err.response && err.response.data && err.response.data.error;
    if (apiError) {
      console.error("API error details:", apiError);
    }

    throw err;
  }
};
156+
157+
/**
 * Script entry point: loads the quotes for BOOK_ID, clusters them, asks for a
 * topic per cluster, and writes the result to "quotes.md".
 *
 * The file contains one "## Cluster <index> - <topic>" heading per cluster,
 * followed by one "- <quote text>" bullet per quote in that cluster.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const quotes = await compileQuotesFomID(BOOK_ID);
  console.log(`${quotes.length} quotes found.`);

  const clusters = clusterEmbeddings(quotes, CLUSTER_THRESHOLD);

  // Collect the Markdown line by line; topics are requested one cluster at a
  // time, in cluster order.
  const lines = [];
  for (const [index, cluster] of clusters.entries()) {
    const topic = await assignTopicToCluster(cluster);
    console.log(`Cluster ${index}: ${topic}`);

    lines.push(`## Cluster ${index} - ${topic}`);
    for (const quote of cluster) {
      lines.push(`- ${quote.text}`);
    }
  }

  // Every line, including the last one, is terminated by a newline.
  const markdown = lines.map((line) => `${line}\n`).join("");
  fs.writeFileSync("quotes.md", markdown);
};
182+
183+
// Run the script. A failed run is logged and reported through a non-zero exit
// code instead of being left as an unhandled promise rejection.
main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)