diff --git a/document-readers/jsoup-reader/ README.md b/document-readers/jsoup-reader/ README.md
new file mode 100644
index 00000000000..9f7fa7984b7
--- /dev/null
+++ b/document-readers/jsoup-reader/ README.md
@@ -0,0 +1,30 @@
+# Spring AI JSoup Document Reader
+
+This module provides an HTML document reader for the Spring AI project. It leverages the [JSoup](https://jsoup.org/) library to parse HTML content and extract text and metadata, making it suitable for use in AI applications.
+
+## Features
+
+* **Flexible Text Extraction:**
+ * Extract all text from the `<body>` of an HTML document.
+ * Extract text from specific elements using CSS selectors.
+ * Group text by element, creating a separate document for each selected element.
+ * Combine text from multiple selected elements using a configurable separator.
+* **Metadata Extraction:**
+ * Extract the document title.
+ * Extract content from `<meta>` tags (e.g., description, keywords). You can specify which meta tags to extract.
+ * Extract a list of all absolute URLs of links (`<a href="...">`) within the document.
+* **Configurable:**
+ * Specify the character encoding (defaults to UTF-8).
+ * Customize the CSS selector for element selection.
+ * Configure the separator string for joining text from multiple elements.
+ * Choose whether to extract all text or use element-based extraction.
+ * Enable/disable link URL extraction.
+ * Add additional metadata using configuration.
+* **Resource-Based:** Works with Spring's `Resource` abstraction, allowing you to read HTML from files, classpath resources, URLs, and even in-memory byte arrays.
+
+---
+
+#### How to Build:
+```bash
+./mvnw -pl document-readers/jsoup-reader clean install
+```
\ No newline at end of file
diff --git a/document-readers/jsoup-reader/pom.xml b/document-readers/jsoup-reader/pom.xml
new file mode 100644
index 00000000000..472a8e720a7
--- /dev/null
+++ b/document-readers/jsoup-reader/pom.xml
@@ -0,0 +1,63 @@
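+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<parent>
+		<groupId>org.springframework.ai</groupId>
+		<artifactId>spring-ai</artifactId>
+		<version>1.0.0-SNAPSHOT</version>
+		<relativePath>../../pom.xml</relativePath>
+	</parent>
+
+	<artifactId>spring-ai-jsoup-document-reader</artifactId>
+	<packaging>jar</packaging>
+	<name>Spring AI Document Reader - HTML</name>
+	<description>Spring AI HTML document reader</description>
+	<url>https://github.com/spring-projects/spring-ai</url>
+
+	<scm>
+		<url>https://github.com/spring-projects/spring-ai</url>
+		<connection>git://github.com/spring-projects/spring-ai.git</connection>
+		<developerConnection>git@github.com:spring-projects/spring-ai.git</developerConnection>
+	</scm>
+
+	<dependencies>
+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-core</artifactId>
+			<version>${project.parent.version}</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.jsoup</groupId>
+			<artifactId>jsoup</artifactId>
+			<version>1.18.3</version>
+		</dependency>
+
+		<dependency>
+			<groupId>org.springframework.boot</groupId>
+			<artifactId>spring-boot-starter-test</artifactId>
+			<scope>test</scope>
+		</dependency>
+	</dependencies>
+
+</project>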
diff --git a/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/JsoupDocumentReader.java b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/JsoupDocumentReader.java
new file mode 100644
index 00000000000..e3fd7989d90
--- /dev/null
+++ b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/JsoupDocumentReader.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.document.DocumentReader;
+import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
+import org.springframework.core.io.DefaultResourceLoader;
+import org.springframework.core.io.Resource;
+
+/**
+ * Reads HTML documents and extracts text content using JSoup.
+ *
+ * <p>
+ * Depending on the {@link JsoupDocumentReaderConfig}, the reader produces either a
+ * single {@link Document} with the text of the whole {@code <body>}, one
+ * {@link Document} per element matched by the configured CSS selector, or a single
+ * {@link Document} with the text of all matched elements joined by the configured
+ * separator. Every produced document carries the page title, the configured
+ * {@code <meta>} tag values, optionally the link URLs, and any additional metadata.
+ *
+ * @see <a href="https://jsoup.org/">JSoup Website</a>
+ * @author Alexandros Pappas
+ */
+public class JsoupDocumentReader implements DocumentReader {
+
+	private final Resource htmlResource;
+
+	private final JsoupDocumentReaderConfig config;
+
+	/**
+	 * Creates a reader for the given resource location using the default configuration.
+	 * @param htmlResource the resource location, resolved with a
+	 * {@link DefaultResourceLoader}
+	 */
+	public JsoupDocumentReader(String htmlResource) {
+		this(new DefaultResourceLoader().getResource(htmlResource));
+	}
+
+	/**
+	 * Creates a reader for the given resource using the default configuration.
+	 * @param htmlResource the HTML resource to read
+	 */
+	public JsoupDocumentReader(Resource htmlResource) {
+		this(htmlResource, JsoupDocumentReaderConfig.defaultConfig());
+	}
+
+	/**
+	 * Creates a reader for the given resource location and configuration.
+	 * @param htmlResource the resource location, resolved with a
+	 * {@link DefaultResourceLoader}
+	 * @param config the reader configuration
+	 */
+	public JsoupDocumentReader(String htmlResource, JsoupDocumentReaderConfig config) {
+		this(new DefaultResourceLoader().getResource(htmlResource), config);
+	}
+
+	/**
+	 * Creates a reader for the given resource and configuration.
+	 * @param htmlResource the HTML resource to read
+	 * @param config the reader configuration
+	 */
+	public JsoupDocumentReader(Resource htmlResource, JsoupDocumentReaderConfig config) {
+		this.htmlResource = htmlResource;
+		this.config = config;
+	}
+
+	/**
+	 * Parses the HTML resource and converts it into documents according to the
+	 * configuration.
+	 * @return the extracted documents; exactly one unless {@code groupByElement} is
+	 * enabled, in which case one per element matched by the selector
+	 * @throws RuntimeException if the resource cannot be read
+	 */
+	@Override
+	public List<Document> get() {
+		try (InputStream inputStream = this.htmlResource.getInputStream()) {
+			// The base URI is left empty, so JSoup has nothing to resolve relative
+			// links against.
+			org.jsoup.nodes.Document doc = Jsoup.parse(inputStream, this.config.charset, "");
+
+			// The metadata only depends on the page and the configuration, so it is
+			// extracted once and copied into every produced document.
+			Map<String, Object> metadata = extractMetadata(doc);
+
+			List<Document> documents = new ArrayList<>();
+
+			if (this.config.allElements) {
+				// Text of the whole body (the head is excluded) as a single document.
+				documents.add(createDocument(doc.body().text(), metadata));
+			}
+			else if (this.config.groupByElement) {
+				// One document per element matched by the selector.
+				for (Element element : doc.select(this.config.selector)) {
+					documents.add(createDocument(element.text(), metadata));
+				}
+			}
+			else {
+				// Text of all matched elements joined into a single document.
+				String text = doc.select(this.config.selector)
+					.stream()
+					.map(Element::text)
+					.collect(Collectors.joining(this.config.separator));
+				documents.add(createDocument(text, metadata));
+			}
+
+			return documents;
+		}
+		catch (IOException e) {
+			throw new RuntimeException("Failed to read HTML resource: " + this.htmlResource, e);
+		}
+	}
+
+	/**
+	 * Creates a Spring AI document with the given text and a copy of the metadata.
+	 */
+	private Document createDocument(String text, Map<String, Object> metadata) {
+		Document document = new Document(text);
+		document.getMetadata().putAll(metadata);
+		return document;
+	}
+
+	/**
+	 * Collects the page title, the configured {@code <meta>} tag values, the link URLs
+	 * (when enabled) and the additional metadata from the configuration.
+	 */
+	private Map<String, Object> extractMetadata(org.jsoup.nodes.Document jsoupDoc) {
+		Map<String, Object> metadata = new HashMap<>();
+		metadata.put("title", jsoupDoc.title());
+
+		for (String metaTag : this.config.metadataTags) {
+			String value = jsoupDoc.select("meta[name=" + metaTag + "]").attr("content");
+			// Tags that are missing or have no content are left out of the metadata.
+			if (!value.isEmpty()) {
+				metadata.put(metaTag, value);
+			}
+		}
+
+		if (this.config.includeLinkUrls) {
+			// "abs:href" asks JSoup for the absolute form of each link.
+			List<String> linkUrls = jsoupDoc.select("a[href]").stream().map(link -> link.attr("abs:href")).toList();
+			metadata.put("linkUrls", linkUrls);
+		}
+
+		// Added last, so user-supplied entries win over extracted ones with the same key.
+		metadata.putAll(this.config.additionalMetadata);
+		return metadata;
+	}
+
+}
diff --git a/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/config/JsoupDocumentReaderConfig.java b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/config/JsoupDocumentReaderConfig.java
new file mode 100644
index 00000000000..c6f1cab7ffb
--- /dev/null
+++ b/document-readers/jsoup-reader/src/main/java/org/springframework/ai/reader/jsoup/config/JsoupDocumentReaderConfig.java
@@ -0,0 +1,207 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup.config;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.springframework.ai.reader.jsoup.JsoupDocumentReader;
+import org.springframework.util.Assert;
+
+/**
+ * Common configuration for the {@link JsoupDocumentReader}.
+ *
+ * <p>
+ * Provides options for specifying the character encoding, CSS selector, text separator,
+ * and whether to extract all text from the body or specific elements, and handling link
+ * extraction.
+ *
+ * <p>
+ * Instances are created through {@link #builder()}. Each built configuration holds its
+ * own copy of the metadata tag list and of the additional metadata map, so later calls
+ * on the builder do not change a configuration that was already built.
+ *
+ * @author Alexandros Pappas
+ */
+public class JsoupDocumentReaderConfig {
+
+	/** Name of the character encoding used to decode the HTML. */
+	public final String charset;
+
+	/** CSS selector of the elements to extract text from. */
+	public final String selector;
+
+	/** Separator used when joining the text of multiple selected elements. */
+	public final String separator;
+
+	/** Whether to extract all text of the body instead of using the selector. */
+	public final boolean allElements;
+
+	/** Whether each selected element becomes a separate document. */
+	public final boolean groupByElement;
+
+	/** Whether link URLs are added to the document metadata. */
+	public final boolean includeLinkUrls;
+
+	/** Names of the {@code <meta>} tags whose content is added to the metadata. */
+	public final List<String> metadataTags;
+
+	/** Extra metadata entries added to every created document. */
+	public final Map<String, Object> additionalMetadata;
+
+	private JsoupDocumentReaderConfig(Builder builder) {
+		this.charset = builder.charset;
+		this.selector = builder.selector;
+		this.separator = builder.separator;
+		this.allElements = builder.allElements;
+		this.includeLinkUrls = builder.includeLinkUrls;
+		this.groupByElement = builder.groupByElement;
+		// Snapshot the collections: the builder stays usable after build() and must
+		// not be able to change this configuration afterwards.
+		this.metadataTags = new ArrayList<>(builder.metadataTags);
+		this.additionalMetadata = new HashMap<>(builder.additionalMetadata);
+	}
+
+	/**
+	 * Creates a new builder initialized with the default settings.
+	 * @return a new builder
+	 */
+	public static Builder builder() {
+		return new Builder();
+	}
+
+	/**
+	 * Creates a configuration with the default settings.
+	 * @return the default configuration
+	 */
+	public static JsoupDocumentReaderConfig defaultConfig() {
+		return builder().build();
+	}
+
+	/**
+	 * Builder for {@link JsoupDocumentReaderConfig}.
+	 */
+	public static class Builder {
+
+		private String charset = "UTF-8";
+
+		private String selector = "body";
+
+		private String separator = "\n";
+
+		private boolean allElements = false;
+
+		private boolean includeLinkUrls = false;
+
+		private List<String> metadataTags = new ArrayList<>(List.of("description", "keywords"));
+
+		private boolean groupByElement = false;
+
+		private Map<String, Object> additionalMetadata = new HashMap<>();
+
+		private Builder() {
+		}
+
+		/**
+		 * Sets the character encoding to use for reading the HTML. Defaults to UTF-8.
+		 * @param charset The charset to use.
+		 * @return This builder.
+		 */
+		public Builder charset(String charset) {
+			this.charset = charset;
+			return this;
+		}
+
+		/**
+		 * Sets the CSS selector to use for extracting elements. Defaults to "body".
+		 * @param selector The CSS selector.
+		 * @return This builder.
+		 */
+		public Builder selector(String selector) {
+			this.selector = selector;
+			return this;
+		}
+
+		/**
+		 * Sets the separator string to use when joining text from multiple elements.
+		 * Defaults to "\n".
+		 * @param separator The separator string.
+		 * @return This builder.
+		 */
+		public Builder separator(String separator) {
+			this.separator = separator;
+			return this;
+		}
+
+		/**
+		 * Enables extracting text from all elements in the body, creating a single
+		 * document. Overrides the selector setting. Defaults to false.
+		 * @param allElements True to extract all text, false otherwise.
+		 * @return This builder.
+		 */
+		public Builder allElements(boolean allElements) {
+			this.allElements = allElements;
+			return this;
+		}
+
+		/**
+		 * Determines whether each element matched by the selector is read into its own
+		 * document instead of joining the text of all matched elements. Defaults to
+		 * false.
+		 * @param groupByElement True to create one document per selected element.
+		 * @return This builder.
+		 */
+		public Builder groupByElement(boolean groupByElement) {
+			this.groupByElement = groupByElement;
+			return this;
+		}
+
+		/**
+		 * Enables the inclusion of link URLs in the document metadata. Defaults to false.
+		 * @param includeLinkUrls True to include link URLs, false otherwise.
+		 * @return This builder.
+		 */
+		public Builder includeLinkUrls(boolean includeLinkUrls) {
+			this.includeLinkUrls = includeLinkUrls;
+			return this;
+		}
+
+		/**
+		 * Adds a metadata tag name to extract from the HTML {@code <meta>} tags.
+		 * @param metadataTag The name of the metadata tag.
+		 * @return This builder.
+		 */
+		public Builder metadataTag(String metadataTag) {
+			this.metadataTags.add(metadataTag);
+			return this;
+		}
+
+		/**
+		 * Sets the metadata tags to extract from the HTML {@code <meta>} tags.
+		 * Overwrites any previously added tags. The list is copied.
+		 * @param metadataTags The list of metadata tag names.
+		 * @return This builder.
+		 */
+		public Builder metadataTags(List<String> metadataTags) {
+			Assert.notNull(metadataTags, "metadataTags must not be null");
+			this.metadataTags = new ArrayList<>(metadataTags);
+			return this;
+		}
+
+		/**
+		 * Adds a single additional metadata entry to all built
+		 * {@link org.springframework.ai.document.Document}s.
+		 * @param key The metadata key.
+		 * @param value The metadata value.
+		 * @return This builder.
+		 */
+		public Builder additionalMetadata(String key, Object value) {
+			Assert.notNull(key, "key must not be null");
+			Assert.notNull(value, "value must not be null");
+			this.additionalMetadata.put(key, value);
+			return this;
+		}
+
+		/**
+		 * Sets the additional metadata for all built
+		 * {@link org.springframework.ai.document.Document}s. Overwrites any previously
+		 * added entries. The map is copied, so the given map is never modified and may
+		 * be immutable.
+		 * @param additionalMetadata The additional metadata entries.
+		 * @return This builder.
+		 */
+		public Builder additionalMetadata(Map<String, Object> additionalMetadata) {
+			Assert.notNull(additionalMetadata, "additionalMetadata must not be null");
+			this.additionalMetadata = new HashMap<>(additionalMetadata);
+			return this;
+		}
+
+		/**
+		 * Builds the configuration from the current builder state.
+		 * @return A new configuration.
+		 */
+		public JsoupDocumentReaderConfig build() {
+			return new JsoupDocumentReaderConfig(this);
+		}
+
+	}
+
+}
diff --git a/document-readers/jsoup-reader/src/test/java/org/springframework/ai/reader/jsoup/JsoupDocumentReaderTests.java b/document-readers/jsoup-reader/src/test/java/org/springframework/ai/reader/jsoup/JsoupDocumentReaderTests.java
new file mode 100644
index 00000000000..c6b7c3c3846
--- /dev/null
+++ b/document-readers/jsoup-reader/src/test/java/org/springframework/ai/reader/jsoup/JsoupDocumentReaderTests.java
@@ -0,0 +1,186 @@
+/*
+ * Copyright 2025-2025 the original author or authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.springframework.ai.reader.jsoup;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
+
+import org.springframework.ai.document.Document;
+import org.springframework.ai.reader.jsoup.config.JsoupDocumentReaderConfig;
+import org.springframework.core.io.ByteArrayResource;
+import org.springframework.core.io.DefaultResourceLoader;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/**
+ * Tests for {@link JsoupDocumentReader}.
+ *
+ * @author Alexandros Pappas
+ */
+class JsoupDocumentReaderTests {
+
+	@Test
+	void testSimpleRead() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html");
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getText()).contains("This is a test HTML document.");
+		assertThat(document.getText()).contains("Some paragraph text.");
+		assertThat(document.getMetadata()).containsEntry("title", "Test HTML");
+		assertThat(document.getMetadata()).containsEntry("description", "A test document for Spring AI");
+		assertThat(document.getMetadata()).containsEntry("keywords", "test,html,spring ai");
+	}
+
+	@Test
+	void testSimpleReadWithAdditionalMetadata() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
+				JsoupDocumentReaderConfig.builder().additionalMetadata("key", "value").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getMetadata()).containsEntry("key", "value");
+	}
+
+	@Test
+	void testSelector() {
+		JsoupDocumentReader reader = new JsoupDocumentReader("classpath:/test.html",
+				JsoupDocumentReaderConfig.builder().selector("p").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		assertThat(documents.get(0).getText()).isEqualTo("Some paragraph text.");
+	}
+
+	@Test
+	void testAllElements() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().allElements(true).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getText()).contains("This is a test HTML document.");
+		assertThat(document.getText()).contains("Some paragraph text.");
+	}
+
+	@Test
+	void testWithLinkUrls() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().includeLinkUrls(true).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+
+		assertThat(document.getMetadata()).containsKey("linkUrls");
+
+		// The reader stores the link URLs as a list of strings under this key.
+		@SuppressWarnings("unchecked")
+		List<String> linkUrls = (List<String>) document.getMetadata().get("linkUrls");
+		assertThat(linkUrls).contains("https://spring.io/");
+	}
+
+	@Test
+	void testWithMetadataTags() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test.html"),
+				JsoupDocumentReaderConfig.builder().metadataTags(List.of("custom1", "custom2")).build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+		assertThat(document.getMetadata()).containsKeys("custom1", "custom2");
+		assertThat(document.getMetadata().get("custom1")).isEqualTo("value1");
+		assertThat(document.getMetadata().get("custom2")).isEqualTo("value2");
+	}
+
+	@Test
+	void testWithGroupByElement() {
+		JsoupDocumentReader reader = new JsoupDocumentReader(
+				new DefaultResourceLoader().getResource("classpath:/test-group-by.html"),
+				JsoupDocumentReaderConfig.builder().groupByElement(true).selector("section").build());
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(2);
+		assertThat(documents.get(0).getText()).isEqualTo("Section 1 content");
+		assertThat(documents.get(1).getText()).isEqualTo("Section 2 content");
+	}
+
+	@Test
+	@Disabled("This test requires an active internet connection")
+	void testWikipediaHeadlines() {
+		// Use a URL resource instead of classpath:
+		JsoupDocumentReader reader = new JsoupDocumentReader("https://en.wikipedia.org/",
+				JsoupDocumentReaderConfig.builder().selector("#mp-itn b a").includeLinkUrls(true).build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document document = documents.get(0);
+
+		// Check for *some* content - we don't want to hard-code specific headlines
+		// as they will change. This verifies the selector is working.
+		assertThat(document.getText()).isNotEmpty();
+
+		// Check if the metadata contains any links
+		assertThat(document.getMetadata()).containsKey("linkUrls");
+		assertThat(document.getMetadata().get("linkUrls")).isInstanceOf(List.class);
+	}
+
+	@Test
+	void testParseFromString() {
+		String html = "<html><head><title>First parse</title></head>"
+				+ "<body><p>Parsed HTML into a doc.</p></body></html>";
+
+		// Wrap the in-memory HTML in a ByteArrayResource. Encode explicitly as UTF-8,
+		// the charset the default reader configuration decodes with.
+		byte[] htmlBytes = html.getBytes(StandardCharsets.UTF_8);
+		ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
+
+		JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
+				JsoupDocumentReaderConfig.builder().build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		Document doc = documents.get(0);
+		assertThat(doc.getText()).isEqualTo("Parsed HTML into a doc.");
+		assertThat(doc.getMetadata()).containsEntry("title", "First parse");
+	}
+
+	@Test
+	void testParseBodyFragment() {
+		String html = "<div><p>Lorem ipsum.</p></div>";
+
+		// Wrap the in-memory HTML fragment in a ByteArrayResource (UTF-8 encoded).
+		byte[] htmlBytes = html.getBytes(StandardCharsets.UTF_8);
+		ByteArrayResource byteArrayResource = new ByteArrayResource(htmlBytes);
+
+		JsoupDocumentReader reader = new JsoupDocumentReader(byteArrayResource,
+				JsoupDocumentReaderConfig.builder()
+					.selector("div") // Select the div
+					.build());
+
+		List<Document> documents = reader.get();
+		assertThat(documents).hasSize(1);
+		assertThat(documents.get(0).getText()).isEqualTo("Lorem ipsum.");
+	}
+
+	@Test
+	void testNonExistingUrl() {
+		// ".invalid" is a reserved top-level domain that is guaranteed never to
+		// resolve, unlike a registrable ".com" name.
+		JsoupDocumentReader reader = new JsoupDocumentReader("https://nonexistingurl.invalid",
+				JsoupDocumentReaderConfig.builder().build());
+		assertThatThrownBy(reader::get).isInstanceOf(RuntimeException.class);
+	}
+
+}
diff --git a/document-readers/jsoup-reader/src/test/resources/test-group-by.html b/document-readers/jsoup-reader/src/test/resources/test-group-by.html
new file mode 100644
index 00000000000..9f6884c6603
--- /dev/null
+++ b/document-readers/jsoup-reader/src/test/resources/test-group-by.html
@@ -0,0 +1,14 @@
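+<!DOCTYPE html>
+<html>
+<head>
+    <title>Group By Element Test</title>
+</head>
+<body>
+<section>
+    <p>Section 1 content</p>
+</section>
+<section>
+    <p>Section 2 content</p>
+</section>
+</body>
+</html>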
\ No newline at end of file
diff --git a/document-readers/jsoup-reader/src/test/resources/test.html b/document-readers/jsoup-reader/src/test/resources/test.html
new file mode 100644
index 00000000000..410995bd136
--- /dev/null
+++ b/document-readers/jsoup-reader/src/test/resources/test.html
@@ -0,0 +1,15 @@
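+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test HTML</title>
+    <meta name="description" content="A test document for Spring AI">
+    <meta name="keywords" content="test,html,spring ai">
+    <meta name="custom1" content="value1">
+    <meta name="custom2" content="value2">
+</head>
+<body>
+<h1>This is a test HTML document.</h1>
+<p>Some paragraph text.</p>
+<a href="https://spring.io/">Spring</a>
+</body>
+</html>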
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 7e6edfef4fe..6c1f4ac568b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -45,6 +45,7 @@
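 		<module>spring-ai-spring-boot-testcontainers</module>
 		<module>spring-ai-spring-cloud-bindings</module>
+		<module>document-readers/jsoup-reader</module>
 		<module>document-readers/markdown-reader</module>
 		<module>document-readers/pdf-reader</module>
 		<module>document-readers/tika-reader</module>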
diff --git a/spring-ai-bom/pom.xml b/spring-ai-bom/pom.xml
index 093caee4dc5..afba258b666 100644
--- a/spring-ai-bom/pom.xml
+++ b/spring-ai-bom/pom.xml
@@ -61,6 +61,12 @@
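+			<dependency>
+				<groupId>org.springframework.ai</groupId>
+				<artifactId>spring-ai-jsoup-document-reader</artifactId>
+				<version>${project.version}</version>
+			</dependency>
+
 			<dependency>
 				<groupId>org.springframework.ai</groupId>
 				<artifactId>spring-ai-markdown-document-reader</artifactId>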
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
index 00519349ba4..78596e3773c 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/etl-pipeline.adoc
@@ -289,6 +289,101 @@ List splitDocuments = new TokenTextSplitter().apply(this.documents);
* Custom metadata can be added to all documents created by the reader using the `getCustomMetadata()` method.
+=== HTML (JSoup)
+
+The `JsoupDocumentReader` processes HTML documents, converting them into a list of `Document` objects using the JSoup library.
+
+==== Example
+
+[source,java]
+----
+@Component
+class MyHtmlReader {
+
+ private final Resource resource;
+
+ MyHtmlReader(@Value("classpath:/my-page.html") Resource resource) {
+ this.resource = resource;
+ }
+
+ List<Document> loadHtml() {
+ JsoupDocumentReaderConfig config = JsoupDocumentReaderConfig.builder()
+ .selector("article p") // Extract paragraphs within <article> tags
+ .charset("ISO-8859-1") // Use ISO-8859-1 encoding
+ .includeLinkUrls(true) // Include link URLs in metadata
+ .metadataTags(List.of("author", "date")) // Extract author and date meta tags
+ .additionalMetadata("source", "my-page.html") // Add custom metadata
+ .build();
+
+ JsoupDocumentReader reader = new JsoupDocumentReader(this.resource, config);
+ return reader.get();
+ }
+}
+----
+
+The `JsoupDocumentReaderConfig` allows you to customize the behavior of the `JsoupDocumentReader`:
+
+* `charset`: Specifies the character encoding of the HTML document (defaults to "UTF-8").
+* `selector`: A JSoup CSS selector to specify which elements to extract text from (defaults to "body").
+* `separator`: The string used to join text from multiple selected elements (defaults to "\n").
+* `allElements`: If `true`, extracts all text from the `<body>` element, ignoring the `selector` (defaults to `false`).
+* `groupByElement`: If `true`, creates a separate `Document` for each element matched by the `selector` (defaults to `false`).
+* `includeLinkUrls`: If `true`, extracts absolute link URLs and adds them to the metadata (defaults to `false`).
+* `metadataTags`: A list of `<meta>` tag names to extract content from (defaults to `["description", "keywords"]`).
+* `additionalMetadata`: Allows you to add custom metadata to all created `Document` objects.
+
+==== Sample Document: my-page.html
+
+[source,html]
+----
+
+
+
+
+ My Web Page
+
+
+
+
+
+
+
+
+
Welcome to My Page
+
+
+
+
Main Content
+
This is the main content of my web page.
+
It contains multiple paragraphs.
+ External Link
+
+
+
+
+----
+
+Behavior:
+
+The `JsoupDocumentReader` processes the HTML content and creates `Document` objects based on the configuration:
+
+* The `selector` determines which elements are used for text extraction.
+* If `allElements` is `true`, all text within the `<body>` is extracted into a single `Document`.
+* If `groupByElement` is `true`, each element matching the `selector` creates a separate `Document`.
+* If neither `allElements` nor `groupByElement` is `true`, text from all elements matching the `selector` is joined using the `separator`.
+* The document title, content from specified `<meta>` tags, and (optionally) link URLs are added to the `Document` metadata.
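+* The HTML is parsed without a base URI, so relative links cannot be resolved: in `linkUrls` they appear as empty strings, and only `href` values that are already absolute are preserved.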
+
+The reader preserves the text content of the selected elements, but removes any HTML tags within them.
+
+
=== Markdown
The `MarkdownDocumentReader` processes Markdown documents, converting them into a list of `Document` objects.
diff --git a/spring-ai-integration-tests/pom.xml b/spring-ai-integration-tests/pom.xml
index b3cbb4cde7d..948d8b62477 100644
--- a/spring-ai-integration-tests/pom.xml
+++ b/spring-ai-integration-tests/pom.xml
@@ -68,6 +68,13 @@
test
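+		<dependency>
+			<groupId>org.springframework.ai</groupId>
+			<artifactId>spring-ai-jsoup-document-reader</artifactId>
+			<version>${project.parent.version}</version>
+			<scope>test</scope>
+		</dependency>
+
 		<dependency>
 			<groupId>org.springframework.ai</groupId>
 			<artifactId>spring-ai-markdown-document-reader</artifactId>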