Skip to content
This repository was archived by the owner on Apr 11, 2025. It is now read-only.

Commit d606d88

Browse files
authored
Add support for parsing PDF pages in parallel (multiprocessing) (#17)
Parse in parallel using multiprocessing library using available CPUs
1 parent 567520b commit d606d88

File tree

6 files changed

+115
-11
lines changed

6 files changed

+115
-11
lines changed

camelot/cli.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ def set_config(self, key, value):
3939
default="1",
4040
help="Comma-separated page numbers." " Example: 1,3,4 or 1,4-end or all.",
4141
)
42+
@click.option(
43+
"--parallel",
44+
is_flag=True,
45+
default=False,
46+
help="Read pdf pages in parallel using all CPU cores.",
47+
)
4248
@click.option("-pw", "--password", help="Password for decryption.")
4349
@click.option("-o", "--output", help="Output file path.")
4450
@click.option(

camelot/handlers.py

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import multiprocessing as mp
12
import os
23
import sys
34
from pathlib import Path
@@ -143,7 +144,12 @@ def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
143144
instream.close()
144145

145146
def parse(
146-
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
147+
self,
148+
flavor="lattice",
149+
suppress_stdout=False,
150+
parallel=False,
151+
layout_kwargs=None,
152+
**kwargs
147153
):
148154
"""Extracts tables by calling parser.extract_tables on all single
149155
page PDFs.
@@ -153,8 +159,10 @@ def parse(
153159
flavor : str (default: 'lattice')
154160
The parsing method to use ('lattice' or 'stream').
155161
Lattice is used by default.
156-
suppress_stdout : str (default: False)
162+
suppress_stdout : bool (default: False)
157163
Suppress logs and warnings.
164+
parallel : bool (default: False)
165+
Process pages in parallel using all available cpu cores.
158166
layout_kwargs : dict, optional (default: {})
159167
A dict of `pdfminer.layout.LAParams
160168
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -171,14 +179,56 @@ def parse(
171179
layout_kwargs = {}
172180

173181
tables = []
182+
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
174183
with TemporaryDirectory() as tempdir:
175-
for p in self.pages:
176-
self._save_page(self.filepath, p, tempdir)
177-
pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
178-
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
179-
for p in pages:
180-
t = parser.extract_tables(
181-
p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
182-
)
183-
tables.extend(t)
184+
cpu_count = mp.cpu_count()
185+
# Use multiprocessing only when cpu_count > 1, because a worker pool can stall
186+
# when only a single CPU is available.
187+
if parallel and len(self.pages) > 1 and cpu_count > 1:
188+
with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
189+
jobs = []
190+
for p in self.pages:
191+
j = pool.apply_async(
192+
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
193+
)
194+
jobs.append(j)
195+
196+
for j in jobs:
197+
t = j.get()
198+
tables.extend(t)
199+
else:
200+
for p in self.pages:
201+
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
202+
tables.extend(t)
203+
184204
return TableList(sorted(tables))
205+
206+
def _parse_page(
207+
self, page, tempdir, parser, suppress_stdout, layout_kwargs
208+
):
209+
"""Extracts tables by calling parser.extract_tables on a single
210+
page PDF.
211+
212+
Parameters
213+
----------
214+
page : str
215+
Page number to parse
216+
parser : Lattice or Stream
217+
The parser to use (Lattice or Stream).
218+
suppress_stdout : bool
219+
Suppress logs and warnings.
220+
layout_kwargs : dict, optional (default: {})
221+
A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
222+
223+
Returns
224+
-------
225+
tables : camelot.core.TableList
226+
List of tables found in PDF.
227+
228+
"""
229+
self._save_page(self.filepath, page, tempdir)
230+
page_path = os.path.join(tempdir, f"page-{page}.pdf")
231+
tables = parser.extract_tables(
232+
page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
233+
)
234+
return tables

camelot/io.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ def read_pdf(
1515
password=None,
1616
flavor="lattice",
1717
suppress_stdout=False,
18+
parallel=False,
1819
layout_kwargs=None,
1920
**kwargs
2021
):
@@ -37,6 +38,8 @@ def read_pdf(
3738
Lattice is used by default.
3839
suppress_stdout : bool, optional (default: False)
3940
Suppress logs and warnings.
41+
parallel : bool, optional (default: False)
42+
Process pages in parallel using all available cpu cores.
4043
layout_kwargs : dict, optional (default: {})
4144
A dict of `pdfminer.layout.LAParams
4245
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
@@ -122,6 +125,7 @@ def read_pdf(
122125
tables = p.parse(
123126
flavor=flavor,
124127
suppress_stdout=suppress_stdout,
128+
parallel=parallel,
125129
layout_kwargs=layout_kwargs,
126130
**kwargs
127131
)

docs/user/quickstart.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,26 @@ By default, Camelot only uses the first page of the PDF to extract tables. To sp
9999

100100
The ``pages`` keyword argument accepts pages as comma-separated string of page numbers. You can also specify page ranges — for example, ``pages=1,4-10,20-30`` or ``pages=1,4-10,20-end``.
101101

102+
Extract tables in parallel
103+
--------------------------
104+
105+
Camelot supports extracting tables in parallel using all the available CPU cores.
106+
107+
::
108+
109+
>>> tables = camelot.read_pdf('foo.pdf', pages='all', parallel=True)
110+
>>> tables
111+
<TableList n=1>
112+
113+
.. tip::
114+
Here's how you can do the same with the :ref:`command-line interface <cli>`.
115+
::
116+
117+
$ camelot --pages all --parallel lattice foo.pdf
118+
119+
.. note:: Reading the PDF document is parallelized by distributing its pages across the available CPU cores.
120+
Therefore, a document with a low page count could be slower to process in parallel.
121+
102122
Reading encrypted PDFs
103123
----------------------
104124

tests/files/diesel_engines.pdf

272 KB
Binary file not shown.

tests/test_cli.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,30 @@ def test_cli_stream(testdir):
6262
assert format_error in result.output
6363

6464

65+
@skip_on_windows
66+
def test_cli_parallel(testdir):
67+
with TemporaryDirectory() as tempdir:
68+
infile = os.path.join(testdir, "diesel_engines.pdf")
69+
outfile = os.path.join(tempdir, "diesel_engines.csv")
70+
runner = CliRunner()
71+
result = runner.invoke(
72+
cli,
73+
[
74+
"--parallel",
75+
"--pages",
76+
"1,2,3",
77+
"--format",
78+
"csv",
79+
"--output",
80+
outfile,
81+
"lattice",
82+
infile,
83+
],
84+
)
85+
assert result.exit_code == 0
86+
assert result.output == "Found 2 tables\n"
87+
88+
6589
def test_cli_password(testdir):
6690
with TemporaryDirectory() as tempdir:
6791
infile = os.path.join(testdir, "health_protected.pdf")

0 commit comments

Comments
 (0)