@@ -3,14 +3,13 @@
 """

 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_transformers import Html2TextTransformer
 from langchain.docstore.document import Document
+from langchain.retrievers import ContextualCompressionRetriever
+from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
+from langchain_community.document_transformers import Html2TextTransformer, EmbeddingsRedundantFilter
 from langchain_community.vectorstores import FAISS
 from langchain_openai import OpenAIEmbeddings
-from langchain.retrievers import ContextualCompressionRetriever
-from langchain.retrievers.document_compressors import EmbeddingsFilter
-from langchain.retrievers.document_compressors import DocumentCompressorPipeline
-from langchain_community.document_transformers import EmbeddingsRedundantFilter
+

 from .base_node import BaseNode

@@ -77,7 +76,8 @@ def execute(self, state):
             chunk_overlap=0,
         )

-        docs_transformed = Html2TextTransformer().transform_documents(document)[0]
+        docs_transformed = Html2TextTransformer(
+        ).transform_documents(document)[0]

         chunks = text_splitter.split_text(docs_transformed.page_content)
         chunked_docs = []
@@ -90,12 +90,15 @@ def execute(self, state):
                 },
             )
             chunked_docs.append(doc)
-
+
         openai_key = self.llm.openai_api_key
-        retriever = FAISS.from_documents(chunked_docs, OpenAIEmbeddings(api_key=openai_key)).as_retriever()
-        embeddings = OpenAIEmbeddings(api_key=openai_key)  # could be any embedding of your choice
+        retriever = FAISS.from_documents(chunked_docs,
+                                         OpenAIEmbeddings(api_key=openai_key)).as_retriever()
+        # could be any embedding of your choice
+        embeddings = OpenAIEmbeddings(api_key=openai_key)
         redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
-        relevant_filter = EmbeddingsFilter(embeddings=embeddings)  # similarity_threshold could be set, now k=20
+        # similarity_threshold could be set, now k=20
+        relevant_filter = EmbeddingsFilter(embeddings=embeddings)
         pipeline_compressor = DocumentCompressorPipeline(
             transformers=[redundant_filter, relevant_filter]
         )
@@ -104,7 +107,8 @@ def execute(self, state):
             base_compressor=pipeline_compressor, base_retriever=retriever
         )

-        compressed_docs = compression_retriever.get_relevant_documents(user_input)
+        compressed_docs = compression_retriever.get_relevant_documents(
+            user_input)
         print("Documents compressed and stored in a vector database.")
         state.update({"relevant_chunks": compressed_docs})
         return state
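For reference, the contextual compression retrieval that these hunks rearrange can be reproduced standalone. Below is a minimal sketch, assuming the same LangChain classes the diff imports; the sample texts, the query, and the API key are hypothetical placeholders rather than values from this node.

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(api_key="sk-...")  # hypothetical key

# Plain FAISS retriever over a few toy chunks (one near-duplicate on purpose).
retriever = FAISS.from_texts(
    ["LangChain splits documents.", "LangChain splits documents!", "FAISS stores vectors."],
    embeddings,
).as_retriever()

# Stage 1 drops near-duplicate chunks; stage 2 keeps only chunks similar to the query.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embeddings)
relevant_filter = EmbeddingsFilter(embeddings=embeddings, similarity_threshold=0.75)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[redundant_filter, relevant_filter]
)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)
compressed_docs = compression_retriever.get_relevant_documents("How are documents split?")

Dropping redundant chunks before the relevance filter keeps duplicate text from crowding out distinct results, which is why both filters are wired into a single DocumentCompressorPipeline here.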