Skip to content

Commit e2ac7a4

Browse files
committed
Add dependencies and update clustering algorithm
1 parent 34810d8 commit e2ac7a4

7 files changed

+434
-20
lines changed

.DS_Store

0 Bytes
Binary file not shown.

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@ amazon_links.json
55
records*.json
66
books-notion-table.csv
77
kindle-highlights
8-
kindle-export-supabase.js
8+
kindle-export-supabase.js
9+
ocr

compile-quotes.js

+157-18
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ import dotenv from "dotenv";
99
import fs from "fs";
1010
import similarity from "compute-cosine-similarity";
1111
import { Configuration, OpenAIApi } from "openai";
12+
import * as math from "mathjs";
13+
import { kmeans } from "ml-kmeans";
1214

1315
dotenv.config();
1416

@@ -21,8 +23,8 @@ const openai = new OpenAIApi(configuration);
2123

2224
dotenv.config();
2325

24-
const CLUSTER_THRESHOLD = 0.8;
25-
const BOOK_ID = "38318378";
26+
const CLUSTER_THRESHOLD = 0.78;
27+
const BOOK_ID = "38531384";
2628

2729
const supabaseUrl = process.env.SUPABASE_URL;
2830
const supabaseKey = process.env.SUPABASE_KEY;
@@ -32,6 +34,44 @@ const supabase = createClient(supabaseUrl, supabaseKey, {
3234
},
3335
});
3436

37+
/**
 * Minimal k-means (Lloyd's algorithm) over an array of numeric vectors.
 *
 * Centroids are seeded by picking `k` random points from `data` (with
 * replacement), so results are not deterministic across calls. Iteration
 * stops as soon as an assignment pass produces no changes, or after a fixed
 * iteration cap as a safety net.
 *
 * @param {number[][]} data - Points to cluster; all vectors are expected to
 *   share the same dimensionality.
 * @param {number} k - Number of clusters to form (positive integer).
 * @returns {number[]} For each point in `data`, the index (0..k-1) of the
 *   cluster it was assigned to. Empty `data` yields an empty array.
 * @throws {RangeError} If `k` is not a positive integer.
 */
function kMeansLocal(data, k) {
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError(`k must be a positive integer, got ${k}`);
  }
  if (data.length === 0) {
    return [];
  }

  // Upper bound so pathological inputs can never loop forever.
  const MAX_ITERATIONS = 100;

  // Squared Euclidean distance: same argmin as the true distance, no sqrt.
  const squaredDistance = (a, b) =>
    a.reduce((sum, value, dim) => sum + (value - b[dim]) ** 2, 0);

  // Seed the centroids with randomly chosen data points.
  let centroids = Array.from(
    { length: k },
    () => data[Math.floor(Math.random() * data.length)]
  );
  let assignments = [];

  for (let iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
    // Assignment step: each point goes to its nearest centroid
    // (ties resolve to the lowest centroid index).
    const nextAssignments = data.map((point) => {
      let nearest = 0;
      let nearestDistance = Infinity;
      centroids.forEach((centroid, index) => {
        const distance = squaredDistance(point, centroid);
        if (distance < nearestDistance) {
          nearest = index;
          nearestDistance = distance;
        }
      });
      return nearest;
    });

    // Converged when no point switched cluster since the previous pass.
    const isStable =
      nextAssignments.length === assignments.length &&
      nextAssignments.every((cluster, i) => cluster === assignments[i]);
    assignments = nextAssignments;
    if (isStable) {
      break;
    }

    // Update step: move each centroid to the mean of its members.
    // A centroid with no members keeps its previous position.
    centroids = centroids.map((centroid, index) => {
      const members = data.filter((_, i) => assignments[i] === index);
      if (members.length === 0) {
        return centroid;
      }
      return centroid.map(
        (_, dim) =>
          members.reduce((sum, member) => sum + member[dim], 0) / members.length
      );
    });
  }

  return assignments;
}
74+
3575
const getQuotes = async (bookId) => {
3676
const { data, error } = await supabase
3777
.from("highlights")
@@ -120,7 +160,7 @@ function clusterEmbeddings(quotes, threshold = 0.01) {
120160

121161
const assignTopicToCluster = async (cluster) => {
122162
try {
123-
const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #`;
163+
const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #. No bold (**) or italics (*) are needed.`;
124164

125165
const completion = await openai.createChatCompletion({
126166
messages: [
@@ -154,34 +194,133 @@ const assignTopicToCluster = async (cluster) => {
154194
}
155195
};
156196

197+
/**
 * Asks the chat model for a short summary of a cluster of quotes.
 *
 * @param {{ text: string }[]} cluster - Quotes to summarize; their `text`
 *   fields are joined with newlines and sent as the user message.
 * @returns {Promise<string>} The model's reply with surrounding whitespace
 *   trimmed.
 * @throws Rethrows the original error from the API call (after logging it).
 */
const summarizeCluster = async (cluster) => {
  const prompt = `Given the following quotes, summarize them into a two-three sentence summary. Return only the summary`;

  try {
    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    return content.trim();
  } catch (err) {
    console.log("START ERROR");
    console.error(err);

    // Errors without an HTTP response (network failures, programming errors)
    // have no `response`; optional chaining keeps the logging itself from
    // throwing and masking the original error.
    const responseData = err?.response?.data;
    if (responseData !== undefined) {
      console.error(responseData);
    }
    const apiError = responseData?.error;
    if (apiError) {
      console.error(apiError.message);
      console.error(apiError.code);
      console.error(apiError.status);
      console.error(apiError.request);
    }

    console.log("END ERROR");
    throw err;
  }
};
232+
233+
/**
 * Asks the chat model for follow-up questions about a cluster of quotes.
 *
 * @param {{ text: string }[]} cluster - Quotes to ask about; their `text`
 *   fields are joined with newlines and sent as the user message.
 * @returns {Promise<string>} The model's reply (requested as a bulleted list)
 *   with surrounding whitespace trimmed.
 * @throws Rethrows the original error from the API call (after logging it).
 */
const followUpQuestions = async (cluster) => {
  const prompt = `Given the following quotes, what are some follow up questions you could ask about them? Return only the questions as a bulleted list`;

  try {
    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    return content.trim();
  } catch (err) {
    console.log("START ERROR");
    console.error(err);

    // Errors without an HTTP response (network failures, programming errors)
    // have no `response`; optional chaining keeps the logging itself from
    // throwing and masking the original error.
    const responseData = err?.response?.data;
    if (responseData !== undefined) {
      console.error(responseData);
    }
    const apiError = responseData?.error;
    if (apiError) {
      console.error(apiError.message);
      console.error(apiError.code);
      console.error(apiError.status);
      console.error(apiError.request);
    }

    console.log("END ERROR");
    throw err;
  }
};
268+
157269
/**
 * Entry point: loads the quotes for BOOK_ID, clusters them by embedding with
 * k-means, asks the chat model for a topic and summary per cluster, and
 * writes the result as Markdown to `output.md`.
 *
 * Follow-up questions (see `followUpQuestions`) are intentionally not part of
 * the generated document.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  const EMBEDDING_DIMENSIONS = 1536;
  const DESIRED_CLUSTERS = 5;

  const quotes = await compileQuotesFomID(BOOK_ID);

  console.log(`${quotes.length} quotes found.`);

  // Keep every quote paired with its parsed embedding. Filtering the pairs
  // (rather than the embeddings alone) keeps k-means output indices aligned
  // with the quotes they describe even when some quotes are dropped.
  const embedded = quotes
    .map((quote) => ({ quote, embedding: JSON.parse(quote.embedding) }))
    .filter(
      ({ embedding }) =>
        Array.isArray(embedding) && embedding.length === EMBEDDING_DIMENSIONS
    );

  if (embedded.length === 0) {
    console.warn("No quotes with a usable embedding; nothing to cluster.");
    return;
  }

  // k-means cannot form more clusters than there are points.
  const k = Math.min(DESIRED_CLUSTERS, embedded.length);
  const assignments = kmeans(
    embedded.map(({ embedding }) => embedding),
    k
  );

  // Group quotes by the cluster index assigned to their embedding.
  const clusters = Array.from({ length: k }, () => []);
  assignments.clusters.forEach((clusterIndex, i) => {
    clusters[clusterIndex].push(embedded[i].quote);
  });

  // Render each cluster as a heading, a summary, and a bulleted quote list.
  let markdown = "";
  for (let i = 0; i < clusters.length; i++) {
    const cluster = clusters[i];
    console.log(`Cluster ${i} has ${cluster.length} quotes.`);

    // An empty cluster has nothing to title or summarize; skip the API calls.
    if (cluster.length === 0) {
      continue;
    }

    // Topic and summary are independent requests, so issue them together.
    const [topic, summary] = await Promise.all([
      assignTopicToCluster(cluster),
      summarizeCluster(cluster),
    ]);

    markdown += `## ${topic}\n\n`;
    markdown += `### Summary\n\n${summary}\n\n`;
    markdown += `### Quotes\n\n`;
    for (const quote of cluster) {
      markdown += `- ${quote.text}\n`;
    }
    markdown += `\n\n`;
  }

  fs.writeFileSync("output.md", markdown);
};
186325

187326
// main();

0 commit comments

Comments
 (0)