utils.ts
import { Pinecone, PineconeRecord, RecordMetadata } from "@pinecone-database/pinecone";
import { FeatureExtractionPipeline, pipeline } from "@xenova/transformers";
import { Document } from "langchain/document";
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
import { batchsize } from "./config";
// Shape of the progress callback shared by the helpers below.
type ProgressCallback = (filename: string, totalChunks: number, chunksUpserted: number, isComplete: boolean) => void;

// Module-level progress state: the active callback and per-document chunk counters.
let callback: ProgressCallback;
let totalDocumentChunks: number;
let totalDocumentChunksUpserted: number;
export async function updateVectorDB(
  client: Pinecone,
  indexname: string,
  namespace: string,
  docs: Document[],
  progressCallback: ProgressCallback
) {
  callback = progressCallback;
  totalDocumentChunks = 0;
  totalDocumentChunksUpserted = 0;
  // Load the embedding model once and reuse it across all documents.
  const modelname = 'mixedbread-ai/mxbai-embed-large-v1';
  const extractor = await pipeline('feature-extraction', modelname, {
    quantized: false
  });
  for (const doc of docs) {
    await processDocument(client, indexname, namespace, doc, extractor);
  }
  // Final progress event: marks the run complete (the filename argument is a placeholder here).
  if (callback !== undefined) {
    callback("filename", totalDocumentChunks, totalDocumentChunksUpserted, true);
  }
}
async function processDocument(
  client: Pinecone,
  indexname: string,
  namespace: string,
  doc: Document<Record<string, any>>,
  extractor: FeatureExtractionPipeline
) {
  // Split the document into chunks and reset the per-document progress counters.
  const splitter = new RecursiveCharacterTextSplitter();
  const documentChunks = await splitter.splitText(doc.pageContent);
  totalDocumentChunks = documentChunks.length;
  totalDocumentChunksUpserted = 0;
  const filename = getFilename(doc.metadata.source);
  // Drain the chunk list in batches of `batchsize`, upserting one batch at a time.
  let chunkBatchIndex = 0;
  while (documentChunks.length > 0) {
    chunkBatchIndex++;
    const chunkBatch = documentChunks.splice(0, batchsize);
    await processOneBatch(client, indexname, namespace, extractor, chunkBatch, chunkBatchIndex, filename);
  }
}
// Derive a vector-id prefix from a source path, e.g. "docs/report.pdf" -> "report".
function getFilename(filename: string): string {
  const docname = filename.substring(filename.lastIndexOf("/") + 1);
  return docname.substring(0, docname.lastIndexOf(".")) || docname;
}
async function processOneBatch(
  client: Pinecone,
  indexname: string,
  namespace: string,
  extractor: FeatureExtractionPipeline,
  chunkBatch: string[],
  chunkBatchIndex: number,
  filename: string
) {
  // Embed the whole batch in one call; newlines are flattened so each chunk is a single line.
  const output = await extractor(chunkBatch.map(str => str.replace(/\n/g, ' ')), {
    pooling: 'cls'
  });
  const embeddingsBatch = output.tolist();
  // Pair each chunk with its embedding and build the Pinecone records.
  const vectorBatch: PineconeRecord<RecordMetadata>[] = [];
  for (let i = 0; i < chunkBatch.length; i++) {
    const chunk = chunkBatch[i];
    const embedding = embeddingsBatch[i];
    const vector: PineconeRecord<RecordMetadata> = {
      id: `${filename}-${chunkBatchIndex}-${i}`,
      values: embedding,
      metadata: {
        chunk
      }
    };
    vectorBatch.push(vector);
  }
  // Upsert the batch into the target index/namespace and report progress.
  const index = client.Index(indexname).namespace(namespace);
  await index.upsert(vectorBatch);
  totalDocumentChunksUpserted += vectorBatch.length;
  if (callback !== undefined) {
    callback(filename, totalDocumentChunks, totalDocumentChunksUpserted, false);
  }
}
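
/*
 * Usage sketch, assuming a PINECONE_API_KEY environment variable, an existing
 * index named "docs-index", and a `docs: Document[]` array loaded elsewhere
 * (e.g. via a LangChain document loader); adjust these names for your setup.
 *
 *   const client = new Pinecone({ apiKey: process.env.PINECONE_API_KEY! });
 *   await updateVectorDB(client, "docs-index", "default", docs, (file, total, done, complete) => {
 *     console.log(`${file}: ${done}/${total} chunks${complete ? ' (complete)' : ''}`);
 *   });
 */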