@@ -9,6 +9,8 @@ import dotenv from "dotenv";
9
9
import fs from "fs" ;
10
10
import similarity from "compute-cosine-similarity" ;
11
11
import { Configuration , OpenAIApi } from "openai" ;
12
+ import * as math from "mathjs" ;
13
+ import { kmeans } from "ml-kmeans" ;
12
14
13
15
dotenv . config ( ) ;
14
16
@@ -21,8 +23,8 @@ const openai = new OpenAIApi(configuration);
21
23
22
24
dotenv . config ( ) ;
23
25
24
- const CLUSTER_THRESHOLD = 0.8 ;
25
- const BOOK_ID = "38318378 " ;
26
+ const CLUSTER_THRESHOLD = 0.78 ;
27
+ const BOOK_ID = "38531384 " ;
26
28
27
29
const supabaseUrl = process . env . SUPABASE_URL ;
28
30
const supabaseKey = process . env . SUPABASE_KEY ;
@@ -32,6 +34,44 @@ const supabase = createClient(supabaseUrl, supabaseKey, {
32
34
} ,
33
35
} ) ;
34
36
37
+ // Define the k-means clustering algorithm
38
/**
 * Partition `data` into at most `k` clusters with Lloyd's k-means algorithm.
 *
 * Centroids are seeded from randomly chosen data points, so the labelling can
 * differ between runs. A point is always assigned to the nearest centroid,
 * with ties going to the lowest centroid index.
 *
 * Assumes every point is an array of numbers and that all points have the
 * same length — TODO confirm against callers.
 *
 * @param {number[][]} data - Points to cluster.
 * @param {number} k - Number of centroids to seed; must be a positive integer.
 * @returns {number[]} For each point in `data`, the index (0..k-1) of its cluster.
 * @throws {RangeError} If `data` is non-empty and `k` is not a positive integer.
 */
function kMeansLocal(data, k) {
  if (data.length === 0) {
    return [];
  }
  if (!Number.isInteger(k) || k < 1) {
    throw new RangeError("k must be a positive integer");
  }

  // Safety cap only; assignments normally stabilise long before this.
  const MAX_ITERATIONS = 100;

  // Squared Euclidean distance: same nearest-centroid ordering as the true
  // distance, without the square root.
  const squaredDistance = (a, b) => {
    let total = 0;
    for (let d = 0; d < a.length; d++) {
      const diff = a[d] - b[d];
      total += diff * diff;
    }
    return total;
  };

  // Strict `<` keeps the first centroid on ties.
  const nearestCentroid = (point, centroids) => {
    let best = 0;
    let bestDistance = squaredDistance(point, centroids[0]);
    for (let c = 1; c < centroids.length; c++) {
      const candidate = squaredDistance(point, centroids[c]);
      if (candidate < bestDistance) {
        best = c;
        bestDistance = candidate;
      }
    }
    return best;
  };

  // Seed the centroids once; re-seeding on every pass would prevent convergence.
  let centroids = Array.from(
    { length: k },
    () => data[Math.floor(Math.random() * data.length)]
  );
  let assignments = [];

  for (let iteration = 0; iteration < MAX_ITERATIONS; iteration++) {
    const nextAssignments = data.map((point) =>
      nearestCentroid(point, centroids)
    );

    // Centroids are a pure function of the assignments, so unchanged
    // assignments mean the centroids have stopped moving too.
    const converged =
      nextAssignments.length === assignments.length &&
      nextAssignments.every((label, i) => label === assignments[i]);
    assignments = nextAssignments;
    if (converged) {
      break;
    }

    centroids = centroids.map((centroid, c) => {
      const members = data.filter((_, i) => assignments[i] === c);
      if (members.length === 0) {
        // An empty cluster keeps its previous centroid.
        return centroid;
      }
      return centroid.map(
        (_, d) =>
          members.reduce((sum, point) => sum + point[d], 0) / members.length
      );
    });
  }

  return assignments;
}
74
+
35
75
const getQuotes = async ( bookId ) => {
36
76
const { data, error } = await supabase
37
77
. from ( "highlights" )
@@ -120,7 +160,7 @@ function clusterEmbeddings(quotes, threshold = 0.01) {
120
160
121
161
const assignTopicToCluster = async ( cluster ) => {
122
162
try {
123
- const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #` ;
163
+ const prompt = `Given the following quotes, what is a good topic for them? Return only the topic as a Markdown heading with no leading #. No bold (**) or italics (*) are needed. ` ;
124
164
125
165
const completion = await openai . createChatCompletion ( {
126
166
messages : [
@@ -154,34 +194,133 @@ const assignTopicToCluster = async (cluster) => {
154
194
}
155
195
} ;
156
196
197
/**
 * Ask the chat model for a two-to-three sentence summary of a cluster of quotes.
 *
 * The `text` of every quote is joined with newlines and sent as the user
 * message; the instruction is sent as the system message.
 *
 * @param {Array<{text: string}>} cluster - Quotes to summarize.
 * @returns {Promise<string>} The first choice's message content, trimmed.
 * @throws Rethrows the original error after logging it.
 */
const summarizeCluster = async (cluster) => {
  try {
    const prompt = `Given the following quotes, summarize them into a two-three sentence summary. Return only the summary`;

    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    return content.trim();
  } catch (err) {
    // Errors raised before a response arrives (e.g. network failures) have no
    // `response`. Optional chaining keeps the logging from throwing its own
    // TypeError, which would hide the real error from the caller.
    const apiError = err?.response?.data?.error;
    console.log("START ERROR");
    console.error(err);
    console.error(err?.response);
    console.error(err?.response?.data);
    console.error(apiError);
    console.error(apiError?.message);
    console.error(apiError?.code);
    console.error(apiError?.status);
    console.error(apiError?.request);
    console.log("END ERROR");
    throw err;
  }
};
232
+
233
/**
 * Ask the chat model for follow-up questions about a cluster of quotes.
 *
 * The `text` of every quote is joined with newlines and sent as the user
 * message; the instruction (asking for a bulleted list) is sent as the system
 * message.
 *
 * @param {Array<{text: string}>} cluster - Quotes to generate questions for.
 * @returns {Promise<string>} The first choice's message content, trimmed.
 * @throws Rethrows the original error after logging it.
 */
const followUpQuestions = async (cluster) => {
  try {
    const prompt = `Given the following quotes, what are some follow up questions you could ask about them? Return only the questions as a bulleted list`;

    const completion = await openai.createChatCompletion({
      messages: [
        {
          role: "system",
          content: prompt,
        },
        {
          role: "user",
          content: cluster.map((quote) => quote.text).join("\n"),
        },
      ],
      model: "gpt-3.5-turbo",
    });

    const content = completion.data.choices[0].message.content;

    return content.trim();
  } catch (err) {
    // Errors raised before a response arrives (e.g. network failures) have no
    // `response`. Optional chaining keeps the logging from throwing its own
    // TypeError, which would hide the real error from the caller.
    const apiError = err?.response?.data?.error;
    console.log("START ERROR");
    console.error(err);
    console.error(err?.response);
    console.error(err?.response?.data);
    console.error(apiError);
    console.error(apiError?.message);
    console.error(apiError?.code);
    console.error(apiError?.status);
    console.error(apiError?.request);
    console.log("END ERROR");
    throw err;
  }
};
268
+
157
269
/**
 * Cluster the quotes of `BOOK_ID` with k-means and write a Markdown report.
 *
 * For every non-empty cluster the report contains a topic heading (from
 * `assignTopicToCluster`), a summary (from `summarizeCluster`) and the quotes
 * as a bulleted list.
 *
 * @returns {Promise<void>} Resolves once the report has been written.
 */
const main = async () => {
  const quotes = await compileQuotesFomID(BOOK_ID);

  console.log(quotes.length + " quotes found.");

  const k = 5;
  const EMBEDDING_LENGTH = 1536;

  // Keep each quote paired with its parsed embedding. Filtering the vectors
  // alone would shift the k-means result indices relative to `quotes` and
  // attach quotes to the wrong clusters.
  const embedded = quotes
    .map((quote) => ({ quote, vector: JSON.parse(quote.embedding) }))
    .filter(
      ({ vector }) =>
        Array.isArray(vector) && vector.length === EMBEDDING_LENGTH
    );

  const assignments = kmeans(
    embedded.map(({ vector }) => vector),
    k
  );

  // assignments.clusters[i] is the cluster index of the i-th vector passed in.
  const clusters = Array.from({ length: k }, () => []);
  assignments.clusters.forEach((clusterIndex, i) => {
    clusters[clusterIndex].push(embedded[i].quote);
  });

  // Convert each cluster into a heading, a summary and a bulleted list of quotes.
  let markdown = "";
  for (let i = 0; i < clusters.length; i++) {
    const cluster = clusters[i];
    console.log(`Cluster ${i} has ${cluster.length} quotes.`);
    if (cluster.length === 0) {
      // Nothing to describe; avoid sending an empty prompt to the API.
      continue;
    }

    const topic = await assignTopicToCluster(cluster);
    const summary = await summarizeCluster(cluster);

    markdown += `## ${topic} \n\n`;
    markdown += `### Summary\n\n${summary} \n\n`;
    markdown += `### Quotes\n\n`;
    for (const quote of cluster) {
      markdown += `- ${quote.text} \n`;
    }
    markdown += `\n\n`;
  }

  // NOTE(review): the space in this file name looks like an extraction
  // artifact of "output.md" — confirm before relying on the path.
  fs.writeFileSync("output .md", markdown);
};
186
325
187
326
// main();
0 commit comments