|
| 1 | +import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface' |
| 2 | +import { UnstructuredDirectoryLoader, UnstructuredLoaderOptions } from 'langchain/document_loaders/fs/unstructured' |
| 3 | +import { getCredentialData, getCredentialParam } from '../../../src/utils' |
| 4 | + |
/**
 * Document loader node that reads every file in a folder through an
 * Unstructured API endpoint and returns the resulting documents.
 *
 * The constructor only declares the node's descriptive fields, its optional
 * credential and its input parameters; all loading happens in `init`.
 */
class UnstructuredFolder_DocumentLoaders implements INode {
    label: string
    name: string
    version: number
    description: string
    type: string
    icon: string
    category: string
    baseClasses: string[]
    credential: INodeParams
    inputs: INodeParams[]

    constructor() {
        this.label = 'Unstructured Folder Loader'
        this.name = 'unstructuredFolderLoader'
        this.version = 1.0
        this.type = 'Document'
        this.icon = 'unstructured.png'
        this.category = 'Document Loaders'
        this.description = 'Use Unstructured.io to load data from a folder'
        this.baseClasses = [this.type]
        // The credential is optional: `init` only sets an API key when one is found.
        this.credential = {
            label: 'Connect Credential',
            name: 'credential',
            type: 'credential',
            credentialNames: ['unstructuredApi'],
            optional: true
        }
        this.inputs = [
            {
                label: 'Folder Path',
                name: 'folderPath',
                type: 'string',
                placeholder: ''
            },
            {
                label: 'Unstructured API URL',
                name: 'unstructuredAPIUrl',
                description:
                    'Unstructured API URL. Read <a target="_blank" href="https://unstructured-io.github.io/unstructured/introduction.html#getting-started">more</a> on how to get started',
                type: 'string',
                default: 'http://localhost:8000/general/v0/general'
            },
            {
                label: 'Element Type',
                name: 'elementType',
                description:
                    'Unstructured partition document into different types, select the types to return. If not selected, all types will be returned',
                type: 'multiOptions',
                options: [
                    { label: 'FigureCaption', name: 'FigureCaption' },
                    { label: 'NarrativeText', name: 'NarrativeText' },
                    { label: 'ListItem', name: 'ListItem' },
                    { label: 'Title', name: 'Title' },
                    { label: 'Address', name: 'Address' },
                    { label: 'Table', name: 'Table' },
                    { label: 'PageBreak', name: 'PageBreak' },
                    { label: 'Header', name: 'Header' },
                    { label: 'Footer', name: 'Footer' },
                    { label: 'UncategorizedText', name: 'UncategorizedText' },
                    { label: 'Image', name: 'Image' },
                    { label: 'Formula', name: 'Formula' }
                ],
                default: [],
                optional: true,
                additionalParams: true
            },
            {
                label: 'Metadata',
                name: 'metadata',
                type: 'json',
                optional: true,
                additionalParams: true
            }
        ]
    }

    /**
     * Loads the folder through the Unstructured API and post-processes the result.
     *
     * Steps, in order:
     *  1. build the loader options from the `unstructuredAPIUrl` input and, when the
     *     credential yields an `unstructuredAPIKey`, the API key;
     *  2. load all documents from `folderPath`;
     *  3. if a `metadata` input is present, merge it over each document's metadata
     *     (user-supplied keys win on conflict);
     *  4. if element types were selected, keep only documents whose
     *     `metadata.category` is one of them.
     *
     * @param nodeData - node configuration; reads `inputs.folderPath`,
     *   `inputs.unstructuredAPIUrl`, `inputs.metadata`, `inputs.elementType` and `credential`.
     * @param _ - unused.
     * @param options - passed through to `getCredentialData`.
     * @returns the loaded (optionally enriched and filtered) documents.
     * @throws SyntaxError if `metadata` is a string that is not valid JSON; errors
     *   raised by the loader or the credential lookup propagate unchanged.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const folderPath = nodeData.inputs?.folderPath as string
        const unstructuredAPIUrl = nodeData.inputs?.unstructuredAPIUrl as string
        const metadata = nodeData.inputs?.metadata
        // May arrive as a real array (the declared default is `[]`) or as a JSON-encoded string.
        const elementType: unknown = nodeData.inputs?.elementType

        const obj: UnstructuredLoaderOptions = { apiUrl: unstructuredAPIUrl }

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const unstructuredAPIKey = getCredentialParam('unstructuredAPIKey', credentialData, nodeData)
        if (unstructuredAPIKey) obj.apiKey = unstructuredAPIKey

        const loader = new UnstructuredDirectoryLoader(folderPath, obj)
        const docs = await loader.load()

        // Normalise any accepted shape to a list of type names; anything else means "no filter".
        const toTypeList = (value: unknown): string[] => {
            if (Array.isArray(value)) return value.filter((t): t is string => typeof t === 'string')
            return typeof value === 'string' && value !== '' ? [value] : []
        }

        let elementTypes: string[] = []
        if (Array.isArray(elementType)) {
            elementTypes = toTypeList(elementType)
        } else if (typeof elementType === 'string' && elementType !== '') {
            try {
                elementTypes = toTypeList(JSON.parse(elementType))
            } catch {
                // Deliberate best effort: an unparseable selection disables filtering
                // instead of failing the whole load.
                elementTypes = []
            }
        }

        const hasSelectedType = (doc: { metadata: Record<string, any> }): boolean => elementTypes.includes(doc.metadata.category)

        if (metadata) {
            const parsedMetadata = typeof metadata === 'object' ? metadata : JSON.parse(metadata)
            // Merge before filtering so the filter sees the final `category` value.
            const finaldocs = docs.map((doc) => ({
                ...doc,
                metadata: {
                    ...doc.metadata,
                    ...parsedMetadata
                }
            }))
            return elementTypes.length ? finaldocs.filter(hasSelectedType) : finaldocs
        }

        return elementTypes.length ? docs.filter(hasSelectedType) : docs
    }
}
| 161 | + |
// Export the loader class under the `nodeClass` key.
module.exports = {
    nodeClass: UnstructuredFolder_DocumentLoaders
}
0 commit comments