Add support for parsing PDFs in parallel

Parse pages in parallel with the multiprocessing library, using all available CPUs
2021-05-01 16:20:27 +02:00 · 2021-05-01 16:20:27 +02:00 · 63161fe379
parent 7709e58d64
commit 63161fe379
4 changed files with 202 additions and 88 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -2,6 +2,7 @@
 import os
 import sys
 import multiprocessing as mp
 from PyPDF2 import PdfFileReader, PdfFileWriter
@ -140,7 +141,12 @@ class PDFHandler(object):
                instream.close()
    def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self,
        flavor="lattice",
        suppress_stdout=False,
        parallel=False,
        layout_kwargs={},
        **kwargs
    ):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.
@ -150,8 +156,10 @@ class PDFHandler(object):
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
-        suppress_stdout : str (default: False)
+        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
        parallel : bool (default: False)
            Process pages in parallel using all available cpu cores.
        layout_kwargs : dict, optional (default: {})
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
@ -164,16 +172,54 @@ class PDFHandler(object):
        """
        tables = []
        with TemporaryDirectory() as tempdir:
            for p in self.pages:
                self._save_page(self.filepath, p, tempdir)
            pages = [
                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
            ]
        parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
+        with TemporaryDirectory() as tempdir:
-                t = parser.extract_tables(
+            cpu_count = mp.cpu_count()
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+            # Using multiprocessing only when cpu_count > 1 to prevent
-                )
+            # a hang (stall) when cpu_count is 1
            if parallel and cpu_count > 1:
                with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
                    jobs = [
                        pool.apply_async(
                            self._parse_page,
                            (p, tempdir, parser, suppress_stdout, layout_kwargs)
                        ) for p in self.pages
                    ]
                    for j in jobs:
                        t = j.get()
                        tables.extend(t)
            else:
                for p in self.pages:
                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
                    tables.extend(t)
        return TableList(sorted(tables))
    def _parse_page(
        self, page, tempdir, parser, suppress_stdout, layout_kwargs
    ):
        """Extracts tables from a single page by calling
        parser.extract_tables on its single-page PDF.

        The page is first saved into ``tempdir`` via ``self._save_page``
        and then read back from ``<tempdir>/page-<page>.pdf``.

        Parameters
        ----------
        page : int or str
            Page number to parse; it is interpolated into the
            ``page-{page}.pdf`` file name. (Exact type depends on what
            ``self.pages`` holds -- TODO confirm.)
        tempdir : str
            Directory in which the single-page PDF is written and from
            which it is parsed.
        parser : Lattice or Stream
            The parser to use (Lattice or Stream).
        suppress_stdout : bool
            Suppress logs and warnings.
        layout_kwargs : dict
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.

        Returns
        -------
        tables : list
            Whatever ``parser.extract_tables`` returns for this page;
            the caller extends a list with it, so presumably an iterable
            of tables -- TODO confirm against the parser implementation.

        """
        # NOTE(review): the path built below assumes _save_page writes the
        # page as "page-<page>.pdf" inside tempdir -- confirm against
        # _save_page.
        self._save_page(self.filepath, page, tempdir)
        page_path = os.path.join(tempdir, f"page-{page}.pdf")
        tables = parser.extract_tables(
            page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
        )
        return tables
--- a/camelot/io.py
+++ b/camelot/io.py
@ -12,6 +12,7 @@ def read_pdf(
    password=None,
    flavor="lattice",
    suppress_stdout=False,
    parallel=False,
    layout_kwargs={},
    **kwargs
 ):
@ -34,6 +35,8 @@ def read_pdf(
        Lattice is used by default.
    suppress_stdout : bool, optional (default: False)
        Suppress logs and warnings.
    parallel : bool, optional (default: False)
        Process pages in parallel using all available cpu cores.
    layout_kwargs : dict, optional (default: {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    table_areas : list, optional (default: None)
@ -113,6 +116,7 @@ def read_pdf(
        tables = p.parse(
            flavor=flavor,
            suppress_stdout=suppress_stdout,
            parallel=parallel,
            layout_kwargs=layout_kwargs,
            **kwargs
        )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -0,0 +1,8 @@
 import pytest
 def pytest_generate_tests(metafunc):
    """Parametrize every test that requests a ``parallel`` argument.

    Each such test is generated twice: once with ``parallel=True`` and
    once with ``parallel=False``, with ids ``parallel=True`` and
    ``parallel=False`` respectively.
    """
    # Only touch tests that actually declare "parallel" among their
    # fixture names; all other tests are left unparametrized.
    if "parallel" in metafunc.fixturenames:
        metafunc.parametrize("parallel", [
            pytest.param(True, id="parallel=True"),
            pytest.param(False, id="parallel=False")
        ])
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -14,219 +14,275 @@ from .data import *
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")
-
+def test_parsing_report(parallel):
 def test_parsing_report():
    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}
    filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert tables[0].parsing_report == parsing_report
-def test_password():
+
 def test_password(parallel):
    df = pd.DataFrame(data_stream)
    filename = os.path.join(testdir, "health_protected.pdf")
-    tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-    tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    tables = camelot.read_pdf(filename, password="userpass", flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-def test_stream():
+
 def test_stream(parallel):
    df = pd.DataFrame(data_stream)
    filename = os.path.join(testdir, "health.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-def test_stream_table_rotated():
+
 def test_stream_table_rotated(parallel):
    df = pd.DataFrame(data_stream_table_rotated)
    filename = os.path.join(testdir, "clockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-def test_stream_two_tables():
+
 def test_stream_two_tables(parallel):
    df1 = pd.DataFrame(data_stream_two_tables_1)
    df2 = pd.DataFrame(data_stream_two_tables_2)
    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert len(tables) == 2
    assert df1.equals(tables[0].df)
    assert df2.equals(tables[1].df)
-def test_stream_table_regions():
+
 def test_stream_table_regions(parallel):
    df = pd.DataFrame(data_stream_table_areas)
    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", table_regions=["320,460,573,335"]
+        filename, flavor="stream",
        parallel=parallel,
        table_regions=["320,460,573,335"]
    )
    assert_frame_equal(df, tables[0].df)
-def test_stream_table_areas():
+def test_stream_table_areas(parallel):
    df = pd.DataFrame(data_stream_table_areas)
    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", table_areas=["320,500,573,335"]
+        filename, flavor="stream",
        parallel=parallel,
        table_areas=["320,500,573,335"]
    )
    assert_frame_equal(df, tables[0].df)
-def test_stream_columns():
+def test_stream_columns(parallel):
    df = pd.DataFrame(data_stream_columns)
    filename = os.path.join(testdir, "mexican_towns.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
+        filename,
        flavor="stream",
        parallel=parallel,
        columns=["67,180,230,425,475"],
        row_tol=10
    )
    assert_frame_equal(df, tables[0].df)
-def test_stream_split_text():
+def test_stream_split_text(parallel):
    df = pd.DataFrame(data_stream_split_text)
    filename = os.path.join(testdir, "tabula/m27.pdf")
    tables = camelot.read_pdf(
        filename,
        flavor="stream",
        parallel=parallel,
        columns=["72,95,209,327,442,529,566,606,683"],
        split_text=True,
    )
    assert_frame_equal(df, tables[0].df)
-def test_stream_flag_size():
+def test_stream_flag_size(parallel):
    df = pd.DataFrame(data_stream_flag_size)
    filename = os.path.join(testdir, "superscript.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
    assert_frame_equal(df, tables[0].df)
 def test_stream_strip_text():
    df = pd.DataFrame(data_stream_strip_text)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
    assert_frame_equal(df, tables[0].df)
 def test_stream_edge_tol():
    df = pd.DataFrame(data_stream_edge_tol)
    filename = os.path.join(testdir, "edge_tol.pdf")
    tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
    assert_frame_equal(df, tables[0].df)
 def test_stream_layout_kwargs():
    df = pd.DataFrame(data_stream_layout_kwargs)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", layout_kwargs={"detect_vertical": False}
+        filename,
        flavor="stream",
        parallel=parallel,
        flag_size=True
    )
    assert_frame_equal(df, tables[0].df)
-def test_lattice():
+def test_stream_strip_text(parallel):
    df = pd.DataFrame(data_stream_strip_text)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
        filename, flavor="stream",
        parallel=parallel,
        strip_text=" ,\n"
    )
    assert_frame_equal(df, tables[0].df)
 def test_stream_edge_tol(parallel):
    """Stream parsing of ``edge_tol.pdf`` with ``edge_tol=500`` must
    reproduce ``data_stream_edge_tol``.

    ``parallel`` is forwarded unchanged to ``camelot.read_pdf``.
    """
    df = pd.DataFrame(data_stream_edge_tol)
    filename = os.path.join(testdir, "edge_tol.pdf")
    tables = camelot.read_pdf(
        filename, flavor="stream",
        parallel=parallel,
        edge_tol=500
    )
    # Only the first extracted table is compared against the fixture.
    assert_frame_equal(df, tables[0].df)
 def test_stream_layout_kwargs(parallel):
    """Stream parsing of ``detect_vertical_false.pdf`` with
    ``layout_kwargs={"detect_vertical": False}`` must reproduce
    ``data_stream_layout_kwargs``.

    ``parallel`` is forwarded unchanged to ``camelot.read_pdf``.
    """
    df = pd.DataFrame(data_stream_layout_kwargs)
    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
        filename,
        flavor="stream",
        parallel=parallel,
        layout_kwargs={"detect_vertical": False}
    )
    # Only the first extracted table is compared against the fixture.
    assert_frame_equal(df, tables[0].df)
 def test_lattice(parallel):
    df = pd.DataFrame(data_lattice)
    filename = os.path.join(
        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    )
-    tables = camelot.read_pdf(filename, pages="2")
+    tables = camelot.read_pdf(filename, pages="2", parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-def test_lattice_table_rotated():
+def test_lattice_table_rotated(parallel):
    df = pd.DataFrame(data_lattice_table_rotated)
    filename = os.path.join(testdir, "clockwise_table_1.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)
    filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)
-def test_lattice_two_tables():
+def test_lattice_two_tables(parallel):
    df1 = pd.DataFrame(data_lattice_two_tables_1)
    df2 = pd.DataFrame(data_lattice_two_tables_2)
    filename = os.path.join(testdir, "twotables_2.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert len(tables) == 2
    assert df1.equals(tables[0].df)
    assert df2.equals(tables[1].df)
-def test_lattice_table_regions():
+def test_lattice_table_regions(parallel):
    df = pd.DataFrame(data_lattice_table_regions)
    filename = os.path.join(testdir, "table_region.pdf")
-    tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        table_regions=["170,370,560,270"]
    )
    assert_frame_equal(df, tables[0].df)
-def test_lattice_table_areas():
+def test_lattice_table_areas(parallel):
    df = pd.DataFrame(data_lattice_table_areas)
    filename = os.path.join(testdir, "twotables_2.pdf")
-    tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        table_areas=["80,693,535,448"]
    )
    assert_frame_equal(df, tables[0].df)
-def test_lattice_process_background():
+def test_lattice_process_background(parallel):
    df = pd.DataFrame(data_lattice_process_background)
    filename = os.path.join(testdir, "background_lines_1.pdf")
-    tables = camelot.read_pdf(filename, process_background=True)
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        process_background=True,
    )
    assert_frame_equal(df, tables[1].df)
-def test_lattice_copy_text():
+def test_lattice_copy_text(parallel):
    df = pd.DataFrame(data_lattice_copy_text)
    filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        line_scale=60,
        copy_text="v"
    )
    assert_frame_equal(df, tables[0].df)
-def test_lattice_shift_text():
+def test_lattice_shift_text(parallel):
    df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
    df_disable = pd.DataFrame(data_lattice_shift_text_disable)
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)
    filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_scale=40)
+    tables = camelot.read_pdf(filename, parallel=parallel, line_scale=40)
    assert df_lt.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        line_scale=40,
        shift_text=[""]
    )
    assert df_disable.equals(tables[0].df)
-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
+    tables = camelot.read_pdf(
        filename,
        parallel=parallel,
        line_scale=40,
        shift_text=["r", "b"]
    )
    assert df_rb.equals(tables[0].df)
-def test_repr():
+def test_repr(parallel):
    filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -234,23 +290,23 @@ def test_repr():
    )
-def test_pages():
+def test_pages(parallel):
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(url)
+    tables = camelot.read_pdf(url, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
-    tables = camelot.read_pdf(url, pages="1-end")
+    tables = camelot.read_pdf(url, pages="1-end", parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )
-    tables = camelot.read_pdf(url, pages="all")
+    tables = camelot.read_pdf(url, pages="all", parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -258,9 +314,9 @@ def test_pages():
    )
-def test_url():
+def test_url(parallel):
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(url)
+    tables = camelot.read_pdf(url, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -268,11 +324,11 @@ def test_url():
    )
-def test_arabic():
+def test_arabic(parallel):
    df = pd.DataFrame(data_arabic)
    filename = os.path.join(testdir, "tabula/arabic.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)