Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add JSoup HTML document reader #2245

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions document-readers/jsoup-reader/ README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Spring AI JSoup Document Reader

This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.

## Features

* **Flexible Text Extraction:**
    * Extract all text from the `<body>` of an HTML document.
    * Extract text from specific elements using CSS selectors.
    * Group text by element, creating a separate document for each selected element.
    * Combine text from multiple selected elements using a configurable separator.
* **Metadata Extraction:**
    * Extract the document title.
    * Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
    * Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
* **Configurable:**
    * Specify the character encoding (defaults to UTF-8).
    * Customize the CSS selector for element selection.
    * Configure the separator string for joining text from multiple elements.
    * Choose whether to extract all text or use element-based extraction.
    * Enable/disable link URL extraction.
    * Add additional metadata using configuration.
* **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.

---

## How to Build
```bash
./mvnw -pl document-readers/jsoup-reader clean install
```
63 changes: 63 additions & 0 deletions document-readers/jsoup-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2025-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>spring-ai-jsoup-document-reader</artifactId>
<packaging>jar</packaging>
<name>Spring AI Document Reader - HTML</name>
<description>Spring AI HTML document reader</description>
<url>https://github.com/spring-projects/spring-ai</url>

<scm>
<url>https://github.com/spring-projects/spring-ai</url>
<connection>git://github.com/spring-projects/spring-ai.git</connection>
<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
</scm>

<dependencies>
<!-- Spring AI core module, pinned to the same version as the parent POM. -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<!-- JSoup HTML parser used by the reader to parse documents and run CSS selectors. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.3</version>
</dependency>

<!-- TESTING: test-scoped only; no version is declared here, so it is expected to be managed by the parent POM. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright 2025-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.reader.jsoup;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;

/**
 * Reads an HTML document and extracts its text content using JSoup.
 *
 * <p>
 * The {@link JsoupDocumentReaderConfig} selects one of three extraction modes:
 * <ul>
 * <li><b>all elements</b> - a single document holding the text of the HTML body;</li>
 * <li><b>group by element</b> - one document per element matched by the configured
 * CSS selector;</li>
 * <li><b>default</b> - a single document holding the text of every matched element,
 * joined with the configured separator.</li>
 * </ul>
 *
 * <p>
 * Every produced document carries the same metadata: the page title under
 * {@code "title"}, the content of each configured {@code <meta name="...">} tag that is
 * non-empty, optionally the link URLs under {@code "linkUrls"}, and finally the
 * additional metadata from the configuration (which wins on key collisions).
 *
 * @author Alexandros Pappas
 * @see <a href="https://jsoup.org/">JSoup Website</a>
 */
public class JsoupDocumentReader implements DocumentReader {

	private final Resource htmlResource;

	private final JsoupDocumentReaderConfig config;

	/**
	 * Creates a reader for the given resource location using the default configuration.
	 * @param htmlResource resource location resolved with a {@link DefaultResourceLoader}
	 */
	public JsoupDocumentReader(String htmlResource) {
		this(new DefaultResourceLoader().getResource(htmlResource));
	}

	/**
	 * Creates a reader for the given resource using the default configuration.
	 * @param htmlResource the HTML resource to read; must not be {@code null}
	 */
	public JsoupDocumentReader(Resource htmlResource) {
		this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
	}

	/**
	 * Creates a reader for the given resource location and configuration.
	 * @param htmlResource resource location resolved with a {@link DefaultResourceLoader}
	 * @param config the reader configuration; must not be {@code null}
	 */
	public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
		this(new DefaultResourceLoader().getResource(htmlResource), config);
	}

	/**
	 * Creates a reader for the given resource and configuration.
	 * @param htmlResource the HTML resource to read; must not be {@code null}
	 * @param config the reader configuration; must not be {@code null}
	 * @throws NullPointerException if either argument is {@code null}
	 */
	public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
		// Fail at construction time rather than with an anonymous NPE inside get().
		this.htmlResource = Objects.requireNonNull(htmlResource, "htmlResource must not be null");
		this.config = Objects.requireNonNull(config, "config must not be null");
	}

	/**
	 * Parses the HTML resource and converts it into Spring AI documents according to
	 * the configured extraction mode.
	 * @return a mutable list of extracted documents; it is empty only in group-by-element
	 * mode when the selector matches nothing
	 * @throws RuntimeException if the resource cannot be opened or read
	 */
	@Override
	public List<Document> get() {
		try (InputStream inputStream = this.htmlResource.getInputStream()) {
			// No base URI is supplied, so "abs:href" lookups can only return hrefs that
			// are already absolute; jsoup yields an empty string for the others.
			org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");

			// The metadata depends only on the parsed page, never on the selected
			// element, so it is computed once and copied into every document.
			Map<String, Object> metadata = extractMetadata(doc);

			List<Document> documents = new ArrayList<>();

			if (this.config.allElements) {
				// Whole page as one document; body() keeps <head> content out.
				documents.add(toDocument(doc.body().text(), metadata));
			}
			else if (this.config.groupByElement) {
				// One document per element matched by the selector.
				for (Element element : doc.select(this.config.selector)) {
					documents.add(toDocument(element.text(), metadata));
				}
			}
			else {
				// One document with the text of all matched elements joined together.
				Elements elements = doc.select(this.config.selector);
				String text = elements.stream()
					.map(Element::text)
					.collect(Collectors.joining(this.config.separator));
				documents.add(toDocument(text, metadata));
			}

			return documents;
		}
		catch (IOException e) {
			throw new RuntimeException("Failed to read HTML resource: " + this.htmlResource, e);
		}
	}

	/**
	 * Collects the page-level metadata shared by every document produced from the page.
	 * @param jsoupDoc the parsed HTML page
	 * @return a new map with the title, the configured meta tags, optionally the link
	 * URLs, and the additional metadata from the configuration
	 */
	private Map<String, Object> extractMetadata(org.jsoup.nodes.Document jsoupDoc) {
		Map<String, Object> metadata = new HashMap<>();
		metadata.put("title", jsoupDoc.title());

		for (String metaTag : this.config.metadataTags) {
			String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content");
			// Absent tags and tags with empty content are skipped.
			if (!value.isEmpty()) {
				metadata.put(metaTag, value);
			}
		}

		if (this.config.includeLinkUrls) {
			// toList() returns an unmodifiable list, so sharing it between documents
			// is safe.
			List<String> linkUrls = jsoupDoc.select("a[href]").stream().map(link -> link.attr("abs:href")).toList();
			metadata.put("linkUrls", linkUrls);
		}

		// Applied last so that user-supplied entries override extracted ones.
		metadata.putAll(this.config.additionalMetadata);
		return metadata;
	}

	/**
	 * Creates a Spring AI document with the given text and its own copy of the
	 * metadata entries.
	 * @param text the extracted text
	 * @param metadata the metadata entries to copy into the document
	 * @return the new document
	 */
	private static Document toDocument(String text, Map<String, Object> metadata) {
		Document document = new Document(text);
		document.getMetadata().putAll(metadata);
		return document;
	}

}
Loading
Loading