Skip to content

Commit 3f07d61

Browse files
langchain[minor]: Multi-file loader (#5584)
* Multi-file loader * Update imports, add entrypoint, format --------- Co-authored-by: jacoblee93 <[email protected]>
1 parent 5984a6d commit 3f07d61

File tree

7 files changed

+240
-0
lines changed

7 files changed

+240
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
---
2+
sidebar_position: 2
3+
hide_table_of_contents: true
4+
---
5+
6+
# Multiple individual files
7+
8+
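This example goes over how to load data from multiple file paths. The second argument is a map of file extensions (each starting with a dot, e.g. `.txt`) to loader factories. Each file will be passed to the loader whose extension matches, and the resulting documents will be concatenated together in the order the file paths were given. By default, a file whose extension has no matching loader is skipped with a warning; directories in the list are always skipped with a warning.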
9+
10+
Example files:
11+
12+
```text
13+
src/document_loaders/example_data/example/
14+
├── example.txt
15+
└── example.csv
16+
17+
src/document_loaders/example_data/example2/
18+
├── example.json
19+
└── example.jsonl
20+
```
21+
22+
Example code:
23+
24+
```typescript
25+
import { MultiFileLoader } from "langchain/document_loaders/fs/multi_file";
26+
import {
27+
JSONLoader,
28+
JSONLinesLoader,
29+
} from "langchain/document_loaders/fs/json";
30+
import { TextLoader } from "langchain/document_loaders/fs/text";
31+
import { CSVLoader } from "langchain/document_loaders/fs/csv";
32+
33+
const loader = new MultiFileLoader(
34+
[
35+
"src/document_loaders/example_data/example/example.txt",
36+
"src/document_loaders/example_data/example/example.csv",
37+
"src/document_loaders/example_data/example2/example.json",
38+
"src/document_loaders/example_data/example2/example.jsonl",
39+
],
40+
{
41+
".json": (path) => new JSONLoader(path, "/texts"),
42+
".jsonl": (path) => new JSONLinesLoader(path, "/html"),
43+
".txt": (path) => new TextLoader(path),
44+
".csv": (path) => new CSVLoader(path, "text"),
45+
}
46+
);
47+
const docs = await loader.load();
48+
console.log({ docs });
49+
```

langchain/.gitignore

+4
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,10 @@ document_loaders/fs/directory.cjs
274274
document_loaders/fs/directory.js
275275
document_loaders/fs/directory.d.ts
276276
document_loaders/fs/directory.d.cts
277+
document_loaders/fs/multi_file.cjs
278+
document_loaders/fs/multi_file.js
279+
document_loaders/fs/multi_file.d.ts
280+
document_loaders/fs/multi_file.d.cts
277281
document_loaders/fs/buffer.cjs
278282
document_loaders/fs/buffer.js
279283
document_loaders/fs/buffer.d.ts

langchain/langchain.config.js

+2
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ export const config = {
115115
"document_loaders/web/sort_xyz_blockchain",
116116
"document_loaders/web/youtube": "document_loaders/web/youtube",
117117
"document_loaders/fs/directory": "document_loaders/fs/directory",
118+
"document_loaders/fs/multi_file": "document_loaders/fs/multi_file",
118119
"document_loaders/fs/buffer": "document_loaders/fs/buffer",
119120
"document_loaders/fs/chatgpt": "document_loaders/fs/chatgpt",
120121
"document_loaders/fs/text": "document_loaders/fs/text",
@@ -254,6 +255,7 @@ export const config = {
254255
"document_loaders/web/couchbase",
255256
"document_loaders/web/youtube",
256257
"document_loaders/fs/directory",
258+
"document_loaders/fs/multi_file",
257259
"document_loaders/fs/buffer",
258260
"document_loaders/fs/chatgpt",
259261
"document_loaders/fs/text",

langchain/package.json

+13
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,10 @@
286286
"document_loaders/fs/directory.js",
287287
"document_loaders/fs/directory.d.ts",
288288
"document_loaders/fs/directory.d.cts",
289+
"document_loaders/fs/multi_file.cjs",
290+
"document_loaders/fs/multi_file.js",
291+
"document_loaders/fs/multi_file.d.ts",
292+
"document_loaders/fs/multi_file.d.cts",
289293
"document_loaders/fs/buffer.cjs",
290294
"document_loaders/fs/buffer.js",
291295
"document_loaders/fs/buffer.d.ts",
@@ -1540,6 +1544,15 @@
15401544
"import": "./document_loaders/fs/directory.js",
15411545
"require": "./document_loaders/fs/directory.cjs"
15421546
},
1547+
"./document_loaders/fs/multi_file": {
1548+
"types": {
1549+
"import": "./document_loaders/fs/multi_file.d.ts",
1550+
"require": "./document_loaders/fs/multi_file.d.cts",
1551+
"default": "./document_loaders/fs/multi_file.d.ts"
1552+
},
1553+
"import": "./document_loaders/fs/multi_file.js",
1554+
"require": "./document_loaders/fs/multi_file.cjs"
1555+
},
15431556
"./document_loaders/fs/buffer": {
15441557
"types": {
15451558
"import": "./document_loaders/fs/buffer.d.ts",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import { extname, resolve } from "node:path";
2+
import { stat } from "node:fs/promises";
3+
import { Document } from "@langchain/core/documents";
4+
import { BaseDocumentLoader } from "../base.js";
5+
import { type LoadersMapping, UnknownHandling } from "./directory.js";
6+
7+
/**
 * A document loader that loads documents from an explicit list of file
 * paths. It extends the `BaseDocumentLoader` class and implements the
 * `load()` method.
 *
 * Each path is dispatched to a loader factory chosen by the file's
 * extension (as reported by `path.extname`, so the lookup is
 * case-sensitive and the key includes the leading dot).
 * @example
 * ```typescript
 *
 * const multiFileLoader = new MultiFileLoader(
 *   ["path/to/file1.pdf", "path/to/file2.txt"],
 *   {
 *     ".pdf": (path: string) => new PDFLoader(path),
 *     ".txt": (path: string) => new TextLoader(path),
 *   },
 * );
 *
 * const docs = await multiFileLoader.load();
 * console.log({ docs });
 *
 * ```
 */
export class MultiFileLoader extends BaseDocumentLoader {
  /**
   * @param filePaths Paths of the files to load; each one is resolved to an
   * absolute path when `load()` runs.
   * @param loaders Mapping from file extension (including the leading dot)
   * to a factory that builds the loader for a given file path.
   * @param unknown What to do with a file whose extension has no entry in
   * `loaders`. Defaults to `UnknownHandling.Warn`.
   * @throws If `loaders` is empty or one of its keys does not start with a
   * dot.
   */
  constructor(
    public filePaths: string[],
    public loaders: LoadersMapping,
    public unknown: UnknownHandling = UnknownHandling.Warn
  ) {
    super();

    // Object.keys yields exactly the own enumerable keys, so no separate
    // own-property check is needed while validating them.
    const extensions = Object.keys(loaders);
    if (extensions.length === 0) {
      throw new Error("Must provide at least one loader");
    }
    for (const extension of extensions) {
      if (!extension.startsWith(".")) {
        throw new Error(`Extension must start with a dot: ${extension}`);
      }
    }
  }

  /**
   * Loads the documents from the provided file paths, one file at a time
   * and in the order the paths were given. Directories are skipped with a
   * warning. For a regular file, the loader factory registered for its
   * extension is used to load its documents. If there is no such factory,
   * the file is handled according to `unknown`: silently skipped for
   * `Ignore`, skipped with a warning for `Warn`, and rejected with an error
   * for `Error`.
   *
   * Errors raised by `stat` (for example for a path that does not exist) or
   * by an individual loader are not caught and reject the returned promise.
   * @returns A promise that resolves to an array of loaded documents.
   */
  public async load(): Promise<Document[]> {
    const documents: Document[] = [];

    for (const filePath of this.filePaths) {
      const fullPath = resolve(filePath);
      const fileStat = await stat(fullPath);

      if (fileStat.isDirectory()) {
        console.warn(`Ignoring directory: ${fullPath}`);
        continue;
      }

      const loaderFactory = this.loaders[extname(fullPath)];
      if (loaderFactory) {
        const loader = loaderFactory(fullPath);
        // Push one document at a time instead of spreading the array into
        // `push(...)`: spreading passes every element as a call argument and
        // can exceed the engine's argument limit for very large results.
        for (const document of await loader.load()) {
          documents.push(document);
        }
      } else {
        switch (this.unknown) {
          case UnknownHandling.Ignore:
            break;
          case UnknownHandling.Warn:
            console.warn(`Unknown file type: ${fullPath}`);
            break;
          case UnknownHandling.Error:
            throw new Error(`Unknown file type: ${fullPath}`);
          default:
            // Guards against a value outside the known handling modes.
            throw new Error(`Unknown unknown handling: ${this.unknown}`);
        }
      }
    }

    return documents;
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import * as url from "node:url";
2+
import * as path from "node:path";
3+
import { test, expect } from "@jest/globals";
4+
import { MultiFileLoader } from "../fs/multi_file.js";
5+
import { CSVLoader } from "../fs/csv.js";
6+
import { PDFLoader } from "../fs/pdf.js";
7+
import { TextLoader } from "../fs/text.js";
8+
import { JSONLoader } from "../fs/json.js";
9+
import { UnknownHandling } from "../fs/directory.js";
10+
11+
test("Test MultiFileLoader", async () => {
  // Fixtures are read from the `example_data` directory next to this module.
  const testsDirectory = path.dirname(url.fileURLToPath(import.meta.url));
  const baseDirectory = path.resolve(testsDirectory, "./example_data");

  // Absolute path of a fixture file inside the base directory.
  const fixture = (fileName: string): string =>
    path.resolve(baseDirectory, fileName);
  // `count` copies of the same source path.
  const repeated = (source: string, count: number): string[] =>
    new Array<string>(count).fill(source);

  const paperPdf = fixture("1706.03762.pdf");
  const resumePdf = fixture("Jacob_Lee_Resume_2023.pdf");
  const cloneWarsCsv = fixture(
    "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.csv"
  );
  const cloneWarsJson = fixture(
    "Star_Wars_The_Clone_Wars_S06E07_Crisis_at_the_Heart.json"
  );
  const complexJson = fixture("complex.json");
  const exampleTxt = fixture("example.txt");
  const separatorCsv = fixture("example_separator.csv");

  const loader = new MultiFileLoader(
    [
      paperPdf,
      resumePdf,
      cloneWarsCsv,
      cloneWarsJson,
      complexJson,
      exampleTxt,
      separatorCsv,
    ],
    {
      // The pipe-separated fixture needs an explicit separator option.
      ".csv": (p) =>
        p.includes("separator.csv")
          ? new CSVLoader(p, { column: "html", separator: "|" })
          : new CSVLoader(p, "html"),
      ".pdf": (p) => new PDFLoader(p),
      ".txt": (p) => new TextLoader(p),
      ".json": (p) => new JSONLoader(p),
    },
    UnknownHandling.Ignore
  );

  const docs = await loader.load();
  expect(docs.length).toBe(123);

  // The actual sources are sorted before comparison, so this list has to be
  // written in that same sorted order.
  const expectedSources = [
    // PDF
    ...repeated(paperPdf, 15),
    resumePdf,
    // CSV
    ...repeated(cloneWarsCsv, 32),
    // JSON
    ...repeated(cloneWarsJson, 32),
    ...repeated(complexJson, 10),
    // TXT
    exampleTxt,
    // CSV
    ...repeated(separatorCsv, 32),
  ];

  const actualSources = docs.map((d) => d.metadata.source).sort();
  expect(actualSources).toEqual(expectedSources);
});

langchain/src/load/import_constants.ts

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export const optionalImportEntrypoints: string[] = [
3636
"langchain/document_loaders/web/couchbase",
3737
"langchain/document_loaders/web/youtube",
3838
"langchain/document_loaders/fs/directory",
39+
"langchain/document_loaders/fs/multi_file",
3940
"langchain/document_loaders/fs/buffer",
4041
"langchain/document_loaders/fs/chatgpt",
4142
"langchain/document_loaders/fs/text",

0 commit comments

Comments
 (0)