Skip to content

Commit c0d9484

Browse files
authored
Merge pull request #948 from matthias/feature/CheerioSelector
Added CSS selector to Cheerio
2 parents 1722479 + 173b645 commit c0d9484

File tree

1 file changed

+21
-3
lines changed
  • packages/components/nodes/documentloaders/Cheerio

1 file changed

+21
-3
lines changed

packages/components/nodes/documentloaders/Cheerio/Cheerio.ts

+21-3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import { INode, INodeData, INodeParams } from '../../../src/Interface'
22
import { TextSplitter } from 'langchain/text_splitter'
3-
import { CheerioWebBaseLoader } from 'langchain/document_loaders/web/cheerio'
3+
import { CheerioWebBaseLoader, WebBaseLoaderParams } from 'langchain/document_loaders/web/cheerio'
44
import { test } from 'linkifyjs'
5+
import { parse } from 'css-what'
56
import { webCrawl, xmlScrape } from '../../../src'
7+
import { SelectorType } from 'cheerio'
68

79
class Cheerio_DocumentLoaders implements INode {
810
label: string
@@ -18,7 +20,7 @@ class Cheerio_DocumentLoaders implements INode {
1820
constructor() {
1921
this.label = 'Cheerio Web Scraper'
2022
this.name = 'cheerioWebScraper'
21-
this.version = 1.0
23+
this.version = 1.1
2224
this.type = 'Document'
2325
this.icon = 'cheerio.svg'
2426
this.category = 'Document Loaders'
@@ -66,6 +68,14 @@ class Cheerio_DocumentLoaders implements INode {
6668
'Only used when "Get Relative Links Method" is selected. Set 0 to retrieve all relative links, default limit is 10.',
6769
warning: `Retrieving all links might take long time, and all links will be upserted again if the flow's state changed (eg: different URL, chunk size, etc)`
6870
},
71+
{
72+
label: 'Selector (CSS)',
73+
name: 'selector',
74+
type: 'string',
75+
description: 'Specify a CSS selector to select the content to be extracted',
76+
optional: true,
77+
additionalParams: true
78+
},
6979
{
7080
label: 'Metadata',
7181
name: 'metadata',
@@ -88,10 +98,18 @@ class Cheerio_DocumentLoaders implements INode {
8898
throw new Error('Invalid URL')
8999
}
90100

101+
const selector: SelectorType = nodeData.inputs?.selector as SelectorType
102+
103+
let params: WebBaseLoaderParams = {}
104+
if (selector) {
105+
parse(selector) // css-what is installed as a dependency of cheerio; parse() is called only for validation and throws if the selector is invalid
106+
params['selector'] = selector
107+
}
108+
91109
async function cheerioLoader(url: string): Promise<any> {
92110
try {
93111
let docs = []
94-
const loader = new CheerioWebBaseLoader(url)
112+
const loader = new CheerioWebBaseLoader(url, params)
95113
if (textSplitter) {
96114
docs = await loader.loadAndSplit(textSplitter)
97115
} else {

0 commit comments

Comments
 (0)