Skip to content

Commit

Permalink
feat: add JSoup HTML document reader
Browse files Browse the repository at this point in the history
This commit introduces the `JsoupDocumentReader` and `JsoupDocumentReaderConfig` classes, which provide functionality to read and parse HTML documents using the JSoup library.

The reader supports:
- Extracting text from specific HTML elements using CSS selectors.
- Extracting all text from the body of the document.
- Grouping text by element.
- Extracting metadata, including the document title, meta tags, and link URLs.
- Reading from various resource types (files, URLs, byte arrays).
- Configurable character encoding, selector, separator, and metadata extraction.

This new reader enhances Spring AI's ability to process web content and other HTML-based data sources.
  • Loading branch information
apappascs committed Feb 14, 2025
1 parent c623264 commit 2879e6c
Show file tree
Hide file tree
Showing 11 changed files with 759 additions and 0 deletions.
30 changes: 30 additions & 0 deletions document-readers/jsoup-reader/ README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Spring AI JSoup Document Reader

This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.

## Features

* **Flexible Text Extraction:**
    * Extract all text from the `<body>` of an HTML document.
    * Extract text from specific elements using CSS selectors.
    * Group text by element, creating a separate document for each selected element.
    * Combine text from multiple selected elements using a configurable separator.
* **Metadata Extraction:**
    * Extract the document title.
    * Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
    * Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
* **Configurable:**
    * Specify the character encoding (defaults to UTF-8).
    * Customize the CSS selector for element selection.
    * Configure the separator string for joining text from multiple elements.
    * Choose whether to extract all text or use element-based extraction.
    * Enable/disable link URL extraction.
    * Add additional metadata using configuration.
* **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.

---

#### How to Build:
```bash
./mvnw -pl document-readers/jsoup-reader clean install
```
63 changes: 63 additions & 0 deletions document-readers/jsoup-reader/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Copyright 2025-2025 the original author or authors.
~
~ Licensed under the Apache License, Version 2.0 (the "License");
~ you may not use this file except in compliance with the License.
~ You may obtain a copy of the License at
~
~ https://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->

<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

<artifactId>spring-ai-jsoup-document-reader</artifactId>
<packaging>jar</packaging>
<name>Spring AI Document Reader - HTML</name>
<description>Spring AI HTML document reader</description>
<url>https://github.com/spring-projects/spring-ai</url>

<scm>
<url>https://github.com/spring-projects/spring-ai</url>
<connection>git://github.com/spring-projects/spring-ai.git</connection>
<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
</scm>

<dependencies>
<!-- Spring AI core module; pinned to the same version as the parent POM.
     Presumably the source of the Document / DocumentReader types used by the reader - confirm. -->
<dependency>
<groupId>org.springframework.ai</groupId>
<artifactId>spring-ai-core</artifactId>
<version>${project.parent.version}</version>
</dependency>

<!-- JSoup HTML parser used by JsoupDocumentReader; the version is declared explicitly in this module. -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.18.3</version>
</dependency>

<!-- TESTING -->
<!-- Test-scoped only. No version is given here, so it is presumably managed by the
     parent POM's dependency management - confirm. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>

</dependencies>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright 2025-2025 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.reader.jsoup;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.springframework.ai.document.Document;
import org.springframework.ai.document.DocumentReader;
import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
import org.springframework.core.io.DefaultResourceLoader;
import org.springframework.core.io.Resource;

/**
 * Reads an HTML resource with JSoup and turns its text content into Spring AI
 * {@link Document}s.
 *
 * <p>
 * The supplied {@link JsoupDocumentReaderConfig} selects one of three extraction modes:
 * <ul>
 * <li>{@code allElements}: a single document holding the text of the HTML body;</li>
 * <li>{@code groupByElement}: one document per element matched by the configured
 * selector;</li>
 * <li>otherwise: a single document holding the text of all matched elements, joined
 * with the configured separator.</li>
 * </ul>
 * Every produced document carries the page title, the configured {@code <meta>} tag
 * values, optionally the link URLs, and any additional metadata from the configuration.
 *
 * @see <a href="https://jsoup.org/">JSoup Website</a>
 * @author Alexandros Pappas
 */
public class JsoupDocumentReader implements DocumentReader {

	private final Resource htmlResource;

	private final JsoupDocumentReaderConfig config;

	/**
	 * Creates a reader for the given resource location using the default configuration.
	 * @param htmlResource resource location resolved through a
	 * {@link DefaultResourceLoader}
	 */
	public JsoupDocumentReader(String htmlResource) {
		this(new DefaultResourceLoader().getResource(htmlResource));
	}

	/**
	 * Creates a reader for the given resource using the default configuration.
	 * @param htmlResource the HTML resource to read; must not be {@code null}
	 */
	public JsoupDocumentReader(Resource htmlResource) {
		this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
	}

	/**
	 * Creates a reader for the given resource location and configuration.
	 * @param htmlResource resource location resolved through a
	 * {@link DefaultResourceLoader}
	 * @param config the reader configuration; must not be {@code null}
	 */
	public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
		this(new DefaultResourceLoader().getResource(htmlResource), config);
	}

	/**
	 * Creates a reader for the given resource and configuration.
	 * @param htmlResource the HTML resource to read; must not be {@code null}
	 * @param config the reader configuration; must not be {@code null}
	 * @throws NullPointerException if either argument is {@code null}
	 */
	public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
		// Fail fast here instead of with an anonymous NPE on the first call to get().
		this.htmlResource = Objects.requireNonNull(htmlResource, "htmlResource must not be null");
		this.config = Objects.requireNonNull(config, "config must not be null");
	}

	/**
	 * Parses the HTML resource and extracts documents according to the configuration.
	 * @return the extracted documents; may be empty in {@code groupByElement} mode when
	 * the selector matches nothing
	 * @throws RuntimeException if the resource cannot be read
	 */
	@Override
	public List<Document> get() {
		try (InputStream inputStream = this.htmlResource.getInputStream()) {
			org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");

			List<Document> documents = new ArrayList<>();

			if (this.config.allElements) {
				// Use body() rather than the whole document so <head> text is excluded.
				documents.add(toDocument(doc, doc.body().text()));
			}
			else if (this.config.groupByElement) {
				// One document per matched element; each gets the page-level metadata.
				for (Element element : doc.select(this.config.selector)) {
					documents.add(toDocument(doc, element.text()));
				}
			}
			else {
				// Single document: text of all matched elements joined by the separator.
				String text = doc.select(this.config.selector)
					.stream()
					.map(Element::text)
					.collect(Collectors.joining(this.config.separator));
				documents.add(toDocument(doc, text));
			}

			return documents;
		}
		catch (IOException e) {
			throw new RuntimeException("Failed to read HTML resource: " + this.htmlResource, e);
		}
	}

	/**
	 * Wraps the extracted text in a Spring AI document and attaches page-level metadata.
	 */
	private Document toDocument(org.jsoup.nodes.Document jsoupDoc, String text) {
		Document document = new Document(text);
		addMetadata(jsoupDoc, document);
		return document;
	}

	/**
	 * Copies title, configured meta tags, optional link URLs and additional metadata
	 * from the parsed HTML and the configuration into the Spring AI document.
	 */
	private void addMetadata(org.jsoup.nodes.Document jsoupDoc, Document springDoc) {
		Map<String, Object> metadata = new HashMap<>();
		metadata.put("title", jsoupDoc.title());

		for (String metaTag : this.config.metadataTags) {
			String value = findMetaContent(jsoupDoc, metaTag);
			if (!value.isEmpty()) {
				metadata.put(metaTag, value);
			}
		}

		if (this.config.includeLinkUrls) {
			// NOTE(review): the document is parsed with an empty base URI, so jsoup
			// cannot resolve relative hrefs and "abs:href" yields "" for them.
			List<String> linkUrls = jsoupDoc.select("a[href]").stream().map(link -> link.attr("abs:href")).toList();
			metadata.put("linkUrls", linkUrls);
		}

		// Additional metadata is applied last, so it overrides extracted values on key
		// clashes.
		metadata.putAll(this.config.additionalMetadata);

		springDoc.getMetadata().putAll(metadata);
	}

	/**
	 * Returns the {@code content} attribute of the first {@code <meta>} element whose
	 * {@code name} matches the given tag name (case-insensitively, ignoring surrounding
	 * whitespace) and that has a {@code content} attribute, or an empty string if there
	 * is none.
	 *
	 * <p>
	 * The name is compared as plain text instead of being concatenated into a CSS
	 * selector, so names containing selector syntax (for example {@code ]} or
	 * {@code ,}) cannot break or alter the query.
	 */
	private static String findMetaContent(org.jsoup.nodes.Document jsoupDoc, String metaName) {
		String wanted = metaName.trim();
		for (Element meta : jsoupDoc.select("meta[name]")) {
			if (meta.hasAttr("content") && wanted.equalsIgnoreCase(meta.attr("name").trim())) {
				return meta.attr("content");
			}
		}
		return "";
	}

}
Loading

0 comments on commit 2879e6c

Please sign in to comment.