Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/unstructured loader #1046

Merged
merged 3 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions packages/components/credentials/UnstructuredApi.credential.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import { INodeParams, INodeCredential } from '../src/Interface'

/**
 * Credential descriptor for the Unstructured API.
 *
 * Declares a single password-type input named `unstructuredAPIKey`; the
 * description links to the page where an API key can be obtained.
 */
class UnstructuredApi implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        // The only value this credential collects.
        const apiKeyField: INodeParams = {
            label: 'API Key',
            name: 'unstructuredAPIKey',
            type: 'password'
        }
        const helpText =
            'Refer to <a target="_blank" href="https://unstructured.io/#get-api-key">official guide</a> on how to get api key on Unstructured'

        this.label = 'Unstructured API'
        this.name = 'unstructuredApi'
        this.version = 1.0
        this.description = helpText
        this.inputs = [apiKeyField]
    }
}

// CommonJS export: exposes the credential class under the `credClass` key.
module.exports = { credClass: UnstructuredApi }
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { UnstructuredLoader, UnstructuredLoaderOptions } from 'langchain/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam } from '../../../src/utils'

/**
 * Document-loader node that loads a single file through `UnstructuredLoader`
 * and returns the resulting documents.
 *
 * The returned documents can optionally be restricted to selected element
 * types (matched against `doc.metadata.category`) and have extra metadata
 * merged into each of them.
 */
class UnstructuredFile_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs: INodeParams[]

    constructor() {
        // Element types offered in the multi-select; label and name are identical.
        const elementTypeNames = [
            'FigureCaption',
            'NarrativeText',
            'ListItem',
            'Title',
            'Address',
            'Table',
            'PageBreak',
            'Header',
            'Footer',
            'UncategorizedText',
            'Image',
            'Formula'
        ]

        this.label = 'Unstructured File Loader'
        this.name = 'unstructuredFileLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'unstructured.png'
        this.category = 'Document Loaders'
        this.description = 'Use Unstructured.io to load data from a file path'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['unstructuredApi'],
            optional: true
        }
        this.inputs = [
            {
                label: 'File Path',
                name: 'filePath',
                type: 'string',
                placeholder: ''
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Unstructured API URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Element Type',
                name: 'elementType',
                description:
                    'Unstructured partition document into different types, select the types to return. If not selected, all types will be returned',
                type: 'multiOptions',
                options: elementTypeNames.map((typeName) => ({ label: typeName, name: typeName })),
                default: [],
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Loads the configured file and returns its documents.
     *
     * @param nodeData - Node configuration; reads `inputs.filePath`,
     *   `inputs.unstructuredAPIUrl`, `inputs.elementType`, `inputs.metadata`
     *   and `credential`.
     * @param _ - Unused.
     * @param options - Passed through to `getCredentialData`.
     * @returns The loaded documents. When `metadata` is supplied each document
     *   is returned as a shallow copy with that metadata merged over its own.
     *   When at least one element type is selected, only documents whose
     *   (merged) `metadata.category` is one of the selected types are returned.
     * @throws SyntaxError if `metadata` is a non-object value that is not valid JSON.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const filePath = nodeData.inputs?.filePath as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const metadata = nodeData.inputs?.metadata

        // Validate user input up front so malformed metadata fails before the remote call is made.
        const elementTypes = this.parseElementTypes(nodeData.inputs?.elementType)
        const hasMetadata = Boolean(metadata)
        const extraMetadata: Record<string, any> = hasMetadata
            ? typeof metadata === 'object'
                ? metadata
                : JSON.parse(metadata)
            : {}

        const obj: UnstructuredLoaderOptions = { apiUrl: unstructuredAPIUrl }

        // The credential is optional: only set the key when one was resolved.
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
        if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

        const loader = new UnstructuredLoader(filePath, obj)
        const docs = await loader.load()

        // Merge before filtering so a user-supplied `category` takes part in the filter.
        const enriched: typeof docs = hasMetadata
            ? docs.map((doc) => ({ ...doc, metadata: { ...doc.metadata, ...extraMetadata } }))
            : docs

        if (!elementTypes.length) return enriched
        return enriched.filter((doc) => elementTypes.includes(doc.metadata.category))
    }

    /**
     * Normalises the `elementType` input into a list of type names.
     *
     * Accepts either an array or a JSON-encoded array. Anything else
     * (invalid JSON, a JSON scalar or object, `undefined`) yields an empty
     * list, which means "do not filter". Non-string entries are dropped.
     */
    private parseElementTypes(raw: unknown): string[] {
        let candidate: unknown = raw
        if (typeof raw === 'string') {
            try {
                candidate = JSON.parse(raw)
            } catch {
                return []
            }
        }
        if (!Array.isArray(candidate)) return []
        return candidate.filter((entry): entry is string => typeof entry === 'string')
    }
}

// CommonJS export: exposes the file-loader node class under the `nodeClass` key.
module.exports = { nodeClass: UnstructuredFile_DocumentLoaders }
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
import { UnstructuredDirectoryLoader, UnstructuredLoaderOptions } from 'langchain/document_loaders/fs/unstructured'
import { getCredentialData, getCredentialParam } from '../../../src/utils'

/**
 * Document-loader node that loads a folder through
 * `UnstructuredDirectoryLoader` and returns the resulting documents.
 *
 * The returned documents can optionally be restricted to selected element
 * types (matched against `doc.metadata.category`) and have extra metadata
 * merged into each of them.
 */
class UnstructuredFolder_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs: INodeParams[]

    constructor() {
        // Element types offered in the multi-select; label and name are identical.
        const elementTypeNames = [
            'FigureCaption',
            'NarrativeText',
            'ListItem',
            'Title',
            'Address',
            'Table',
            'PageBreak',
            'Header',
            'Footer',
            'UncategorizedText',
            'Image',
            'Formula'
        ]

        this.label = 'Unstructured Folder Loader'
        this.name = 'unstructuredFolderLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'unstructured.png'
        this.category = 'Document Loaders'
        this.description = 'Use Unstructured.io to load data from a folder'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['unstructuredApi'],
            optional: true
        }
        this.inputs = [
            {
                label: 'Folder Path',
                name: 'folderPath',
                type: 'string',
                placeholder: ''
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Unstructured API URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Element Type',
                name: 'elementType',
                description:
                    'Unstructured partition document into different types, select the types to return. If not selected, all types will be returned',
                type: 'multiOptions',
                options: elementTypeNames.map((typeName) => ({ label: typeName, name: typeName })),
                default: [],
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Loads the configured folder and returns its documents.
     *
     * @param nodeData - Node configuration; reads `inputs.folderPath`,
     *   `inputs.unstructuredAPIUrl`, `inputs.elementType`, `inputs.metadata`
     *   and `credential`.
     * @param _ - Unused.
     * @param options - Passed through to `getCredentialData`.
     * @returns The loaded documents. When `metadata` is supplied each document
     *   is returned as a shallow copy with that metadata merged over its own.
     *   When at least one element type is selected, only documents whose
     *   (merged) `metadata.category` is one of the selected types are returned.
     * @throws SyntaxError if `metadata` is a non-object value that is not valid JSON.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const folderPath = nodeData.inputs?.folderPath as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const metadata = nodeData.inputs?.metadata

        // Validate user input up front so malformed metadata fails before the remote call is made.
        const elementTypes = this.parseElementTypes(nodeData.inputs?.elementType)
        const hasMetadata = Boolean(metadata)
        const extraMetadata: Record<string, any> = hasMetadata
            ? typeof metadata === 'object'
                ? metadata
                : JSON.parse(metadata)
            : {}

        const obj: UnstructuredLoaderOptions = { apiUrl: unstructuredAPIUrl }

        // The credential is optional: only set the key when one was resolved.
        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
        if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

        const loader = new UnstructuredDirectoryLoader(folderPath, obj)
        const docs = await loader.load()

        // Merge before filtering so a user-supplied `category` takes part in the filter.
        const enriched: typeof docs = hasMetadata
            ? docs.map((doc) => ({ ...doc, metadata: { ...doc.metadata, ...extraMetadata } }))
            : docs

        if (!elementTypes.length) return enriched
        return enriched.filter((doc) => elementTypes.includes(doc.metadata.category))
    }

    /**
     * Normalises the `elementType` input into a list of type names.
     *
     * Accepts either an array or a JSON-encoded array. Anything else
     * (invalid JSON, a JSON scalar or object, `undefined`) yields an empty
     * list, which means "do not filter". Non-string entries are dropped.
     */
    private parseElementTypes(raw: unknown): string[] {
        let candidate: unknown = raw
        if (typeof raw === 'string') {
            try {
                candidate = JSON.parse(raw)
            } catch {
                return []
            }
        }
        if (!Array.isArray(candidate)) return []
        return candidate.filter((entry): entry is string => typeof entry === 'string')
    }
}

// CommonJS export: exposes the folder-loader node class under the `nodeClass` key.
module.exports = { nodeClass: UnstructuredFolder_DocumentLoaders }
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class ElasicsearchExisting_VectorStores extends ElasticSearchBase implements INo
async constructVectorStore(
embeddings: Embeddings,
elasticSearchClientArgs: ElasticClientArgs,
docs: Document<Record<string, any>>[] | undefined
_: Document<Record<string, any>>[] | undefined
): Promise<VectorStore> {
return await ElasticVectorSearch.fromExistingIndex(embeddings, elasticSearchClientArgs)
}
Expand Down
1 change: 1 addition & 0 deletions packages/components/src/Interface.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
export type NodeParamsType =
| 'asyncOptions'
| 'options'
| 'multiOptions'
| 'string'
| 'number'
| 'boolean'
Expand Down
Loading