Skip to content

Commit 63835b0

Browse files
authored
Merge pull request #36 from explosion/feature/pipe-as-tuples
Add as_tuples argument to spaCyLayout.pipe
2 parents 5facc6b + 72c995d commit 63835b0

File tree

3 files changed

+60
-9
lines changed

3 files changed

+60
-9
lines changed

README.md

+4-3
Original file line numberDiff line numberDiff line change
@@ -205,7 +205,7 @@ doc = layout("./starcraft.pdf")
205205

206206
#### <kbd>method</kbd> `spaCyLayout.pipe`
207207

208-
Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale.
208+
Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The `as_tuples` argument works the same way as it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).
209209

210210
```python
211211
layout = spaCyLayout(nlp)
@@ -215,8 +215,9 @@ docs = layout.pipe(paths)
215215

216216
| Argument | Type | Description |
217217
| --- | --- | --- |
218-
| `sources` | `Iterable[str \| Path \| bytes]` | Paths of documents to process or bytes. |
219-
| **YIELDS** | `Doc` | The processed spaCy `Doc` object. |
218+
| `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. |
219+
| `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. The method will then yield `(doc, context)` tuples. Defaults to `False`. |
220+
| **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |
220221

221222
## 💡 Examples and code snippets
222223

spacy_layout/layout.py

+47-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
from io import BytesIO
22
from pathlib import Path
3-
from typing import TYPE_CHECKING, Callable, Iterable, Iterator
3+
from typing import (
4+
TYPE_CHECKING,
5+
Callable,
6+
Iterable,
7+
Iterator,
8+
Literal,
9+
TypeVar,
10+
cast,
11+
overload,
12+
)
413

514
import srsly
615
from docling.datamodel.base_models import DocumentStream
@@ -18,6 +27,8 @@
1827
from pandas import DataFrame
1928
from spacy.language import Language
2029

30+
# Type variable for contexts piped with documents
31+
_AnyContext = TypeVar("_AnyContext")
2132

2233
TABLE_PLACEHOLDER = "TABLE"
2334
TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
@@ -76,12 +87,42 @@ def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
7687
result = self.converter.convert(self._get_source(source)).document
7788
return self._result_to_doc(result)
7889

79-
def pipe(self, sources: Iterable[str | Path | bytes]) -> Iterator[Doc]:
90+
@overload
def pipe(
    self,
    sources: Iterable[str | Path | bytes],
    as_tuples: Literal[False] = ...,
) -> Iterator[Doc]: ...

@overload
def pipe(
    self,
    sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
    as_tuples: Literal[True] = ...,
) -> Iterator[tuple[Doc, _AnyContext]]: ...

def pipe(
    self,
    sources: (
        Iterable[str | Path | bytes]
        | Iterable[tuple[str | Path | bytes, _AnyContext]]
    ),
    as_tuples: bool = False,
) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
    """Process multiple documents and create spaCy Doc objects.

    sources: The documents to process, given as paths or bytes. If
        as_tuples is True, an iterable of (source, context) tuples instead.
        Any iterable is accepted, including one-shot iterators and
        generators.
    as_tuples: If True, treat each input item as a (source, context) tuple
        and yield (doc, context) tuples. Defaults to False.
    YIELDS: The processed Doc objects, or (doc, context) tuples if
        as_tuples is True, in the order the converter yields its results.
    """
    if as_tuples:
        # Imported locally so this method does not depend on a new
        # module-level import.
        from itertools import tee

        pairs = cast("Iterable[tuple[str | Path | bytes, _AnyContext]]", sources)
        # The input has to be read twice: once for the sources handed to the
        # converter and once for the contexts. Iterating the same one-shot
        # iterator from two generators would interleave the reads, pairing
        # docs with the wrong context and dropping items. tee() gives each
        # consumer its own view and buffers only what one side has read
        # ahead of the other.
        source_pairs, context_pairs = tee(pairs)
        data = (self._get_source(source) for source, _ in source_pairs)
        contexts = (context for _, context in context_pairs)
        results = self.converter.convert_all(data)
        for result, context in zip(results, contexts):
            yield (self._result_to_doc(result.document), context)
    else:
        plain_sources = cast("Iterable[str | Path | bytes]", sources)
        data = (self._get_source(source) for source in plain_sources)
        results = self.converter.convert_all(data)
        for result in results:
            yield self._result_to_doc(result.document)
85126

86127
def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
87128
if isinstance(source, (str, Path)):

tests/test_general.py

+9
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,15 @@ def test_simple_pipe(nlp):
7474
assert len(doc.spans[layout.attrs.span_group]) == 4
7575

7676

77+
def test_simple_pipe_as_tuples(nlp):
    """Check that pipe(as_tuples=True) returns each doc with its context."""
    layout = spaCyLayout(nlp)
    inputs = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
    outputs = list(layout.pipe(inputs, as_tuples=True))
    docs = [doc for doc, _ in outputs]
    contexts = [context for _, context in outputs]
    # Each processed document should expose four layout spans.
    for doc in docs:
        assert len(doc.spans[layout.attrs.span_group]) == 4
    # Contexts come back in input order.
    assert contexts == ["pdf", "docx"]
84+
85+
7786
def test_table(nlp):
7887
layout = spaCyLayout(nlp)
7988
doc = layout(PDF_TABLE)

0 commit comments

Comments
 (0)