Merge 63161fe379 into 7709e58d64

2021-05-01 20:28:40 +05:30 · 2021-05-01 20:28:40 +05:30 · 620a2caaae
parent 7709e58d64 63161fe379
commit 620a2caaae
4 changed files with 202 additions and 88 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -2,6 +2,7 @@

 import os
 import sys
+import multiprocessing as mp

 from PyPDF2 import PdfFileReader, PdfFileWriter

@ -140,7 +141,12 @@ class PDFHandler(object):
                instream.close()

    def parse(
-        self, flavor="lattice", suppress_stdout=False, layout_kwargs={}, **kwargs
+        self,
+        flavor="lattice",
+        suppress_stdout=False,
+        parallel=False,
+        layout_kwargs={},
+        **kwargs
    ):
        """Extracts tables by calling parser.get_tables on all single
        page PDFs.
@ -150,8 +156,10 @@ class PDFHandler(object):
        flavor : str (default: 'lattice')
            The parsing method to use ('lattice' or 'stream').
            Lattice is used by default.
-        suppress_stdout : str (default: False)
+        suppress_stdout : bool (default: False)
            Suppress logs and warnings.
+        parallel : bool (default: False)
+            Process pages in parallel using all available CPU cores.
        layout_kwargs : dict, optional (default: {})
            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
        kwargs : dict
@ -164,16 +172,54 @@ class PDFHandler(object):

        """
        tables = []
-        with TemporaryDirectory() as tempdir:
-            for p in self.pages:
-                self._save_page(self.filepath, p, tempdir)
-            pages = [
-                os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages
-            ]
        parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
-            for p in pages:
-                t = parser.extract_tables(
-                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
-                )
+        with TemporaryDirectory() as tempdir:
+            cpu_count = mp.cpu_count()
+            # Use multiprocessing only when cpu_count > 1 to avoid
+            # stalling when only a single CPU core is available
+            if parallel and cpu_count > 1:
+                with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
+                    jobs = [
+                        pool.apply_async(
+                            self._parse_page,
+                            (p, tempdir, parser, suppress_stdout, layout_kwargs)
+                        ) for p in self.pages
+                    ]
+                    for j in jobs:
+                        t = j.get()
+                        tables.extend(t)
+            else:
+                for p in self.pages:
+                    t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
                    tables.extend(t)
        return TableList(sorted(tables))
+
+    def _parse_page(
+        self, page, tempdir, parser, suppress_stdout, layout_kwargs
+    ):
+        """Extracts tables by calling parser.extract_tables on a single
+        page PDF.
+
+        Parameters
+        ----------
+        page : str
+            Page number to parse
+        parser : Lattice or Stream
+            The parser to use (Lattice or Stream).
+        suppress_stdout : bool
+            Suppress logs and warnings.
+        layout_kwargs : dict
+            A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+
+        Returns
+        -------
+        tables : list
+            Tables found on the page.
+
+        """
+        self._save_page(self.filepath, page, tempdir)
+        page_path = os.path.join(tempdir, f"page-{page}.pdf")
+        tables = parser.extract_tables(
+            page_path, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
+        )
+        return tables
--- a/camelot/io.py
+++ b/camelot/io.py
@ -12,6 +12,7 @@ def read_pdf(
    password=None,
    flavor="lattice",
    suppress_stdout=False,
+    parallel=False,
    layout_kwargs={},
    **kwargs
 ):
@ -34,6 +35,8 @@ def read_pdf(
        Lattice is used by default.
    suppress_stdout : bool, optional (default: True)
        Print all logs and warnings.
+    parallel : bool, optional (default: False)
+        Process pages in parallel using all available CPU cores.
    layout_kwargs : dict, optional (default: {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
    table_areas : list, optional (default: None)
@ -113,6 +116,7 @@ def read_pdf(
        tables = p.parse(
            flavor=flavor,
            suppress_stdout=suppress_stdout,
+            parallel=parallel,
            layout_kwargs=layout_kwargs,
            **kwargs
        )
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -0,0 +1,8 @@
+import pytest
+
+def pytest_generate_tests(metafunc):
+    if "parallel" in metafunc.fixturenames:
+        metafunc.parametrize("parallel", [
+            pytest.param(True, id="parallel=True"),
+            pytest.param(False, id="parallel=False")
+        ])
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -14,219 +14,275 @@ from .data import *
 testdir = os.path.dirname(os.path.abspath(__file__))
 testdir = os.path.join(testdir, "files")

-
-def test_parsing_report():
+def test_parsing_report(parallel):
    parsing_report = {"accuracy": 99.02, "whitespace": 12.24, "order": 1, "page": 1}

    filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert tables[0].parsing_report == parsing_report


-def test_password():
+
+def test_password(parallel):
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health_protected.pdf")
-    tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream")
+    tables = camelot.read_pdf(filename, password="ownerpass", flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)

-    tables = camelot.read_pdf(filename, password="userpass", flavor="stream")
+    tables = camelot.read_pdf(filename, password="userpass", flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)


-def test_stream():
+
+def test_stream(parallel):
    df = pd.DataFrame(data_stream)

    filename = os.path.join(testdir, "health.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)


-def test_stream_table_rotated():
+
+def test_stream_table_rotated(parallel):
    df = pd.DataFrame(data_stream_table_rotated)

    filename = os.path.join(testdir, "clockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)

    filename = os.path.join(testdir, "anticlockwise_table_2.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)
    assert_frame_equal(df, tables[0].df)


-def test_stream_two_tables():
+
+def test_stream_two_tables(parallel):
    df1 = pd.DataFrame(data_stream_two_tables_1)
    df2 = pd.DataFrame(data_stream_two_tables_2)

    filename = os.path.join(testdir, "tabula/12s0324.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream")
+    tables = camelot.read_pdf(filename, flavor="stream", parallel=parallel)

    assert len(tables) == 2
    assert df1.equals(tables[0].df)
    assert df2.equals(tables[1].df)


-def test_stream_table_regions():
+
+def test_stream_table_regions(parallel):
    df = pd.DataFrame(data_stream_table_areas)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", table_regions=["320,460,573,335"]
+        filename, flavor="stream",
+        parallel=parallel,
+        table_regions=["320,460,573,335"]
    )
    assert_frame_equal(df, tables[0].df)


-def test_stream_table_areas():
+def test_stream_table_areas(parallel):
    df = pd.DataFrame(data_stream_table_areas)

    filename = os.path.join(testdir, "tabula/us-007.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", table_areas=["320,500,573,335"]
+        filename, flavor="stream",
+        parallel=parallel,
+        table_areas=["320,500,573,335"]
    )
    assert_frame_equal(df, tables[0].df)


-def test_stream_columns():
+def test_stream_columns(parallel):
    df = pd.DataFrame(data_stream_columns)

    filename = os.path.join(testdir, "mexican_towns.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", columns=["67,180,230,425,475"], row_tol=10
+        filename,
+        flavor="stream",
+        parallel=parallel,
+        columns=["67,180,230,425,475"],
+        row_tol=10
    )
    assert_frame_equal(df, tables[0].df)


-def test_stream_split_text():
+def test_stream_split_text(parallel):
    df = pd.DataFrame(data_stream_split_text)

    filename = os.path.join(testdir, "tabula/m27.pdf")
    tables = camelot.read_pdf(
        filename,
        flavor="stream",
+        parallel=parallel,
        columns=["72,95,209,327,442,529,566,606,683"],
        split_text=True,
    )
    assert_frame_equal(df, tables[0].df)


-def test_stream_flag_size():
+def test_stream_flag_size(parallel):
    df = pd.DataFrame(data_stream_flag_size)

    filename = os.path.join(testdir, "superscript.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", flag_size=True)
-    assert_frame_equal(df, tables[0].df)
-
-
-def test_stream_strip_text():
-    df = pd.DataFrame(data_stream_strip_text)
-
-    filename = os.path.join(testdir, "detect_vertical_false.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", strip_text=" ,\n")
-    assert_frame_equal(df, tables[0].df)
-
-
-def test_stream_edge_tol():
-    df = pd.DataFrame(data_stream_edge_tol)
-
-    filename = os.path.join(testdir, "edge_tol.pdf")
-    tables = camelot.read_pdf(filename, flavor="stream", edge_tol=500)
-    assert_frame_equal(df, tables[0].df)
-
-
-def test_stream_layout_kwargs():
-    df = pd.DataFrame(data_stream_layout_kwargs)
-
-    filename = os.path.join(testdir, "detect_vertical_false.pdf")
    tables = camelot.read_pdf(
-        filename, flavor="stream", layout_kwargs={"detect_vertical": False}
+        filename,
+        flavor="stream",
+        parallel=parallel,
+        flag_size=True
    )
    assert_frame_equal(df, tables[0].df)


-def test_lattice():
+def test_stream_strip_text(parallel):
+    df = pd.DataFrame(data_stream_strip_text)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="stream",
+        parallel=parallel,
+        strip_text=" ,\n"
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_stream_edge_tol(parallel):
+    df = pd.DataFrame(data_stream_edge_tol)
+
+    filename = os.path.join(testdir, "edge_tol.pdf")
+    tables = camelot.read_pdf(
+        filename, flavor="stream",
+        parallel=parallel,
+        edge_tol=500
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_stream_layout_kwargs(parallel):
+    df = pd.DataFrame(data_stream_layout_kwargs)
+
+    filename = os.path.join(testdir, "detect_vertical_false.pdf")
+    tables = camelot.read_pdf(
+        filename,
+        flavor="stream",
+        parallel=parallel,
+        layout_kwargs={"detect_vertical": False}
+    )
+    assert_frame_equal(df, tables[0].df)
+
+
+def test_lattice(parallel):
    df = pd.DataFrame(data_lattice)

    filename = os.path.join(
        testdir, "tabula/icdar2013-dataset/competition-dataset-us/us-030.pdf"
    )
-    tables = camelot.read_pdf(filename, pages="2")
+    tables = camelot.read_pdf(filename, pages="2", parallel=parallel)
    assert_frame_equal(df, tables[0].df)


-def test_lattice_table_rotated():
+def test_lattice_table_rotated(parallel):
    df = pd.DataFrame(data_lattice_table_rotated)

    filename = os.path.join(testdir, "clockwise_table_1.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)

    filename = os.path.join(testdir, "anticlockwise_table_1.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)


-def test_lattice_two_tables():
+def test_lattice_two_tables(parallel):
    df1 = pd.DataFrame(data_lattice_two_tables_1)
    df2 = pd.DataFrame(data_lattice_two_tables_2)

    filename = os.path.join(testdir, "twotables_2.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert len(tables) == 2
    assert df1.equals(tables[0].df)
    assert df2.equals(tables[1].df)


-def test_lattice_table_regions():
+def test_lattice_table_regions(parallel):
    df = pd.DataFrame(data_lattice_table_regions)

    filename = os.path.join(testdir, "table_region.pdf")
-    tables = camelot.read_pdf(filename, table_regions=["170,370,560,270"])
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        table_regions=["170,370,560,270"]
+    )
    assert_frame_equal(df, tables[0].df)


-def test_lattice_table_areas():
+def test_lattice_table_areas(parallel):
    df = pd.DataFrame(data_lattice_table_areas)

    filename = os.path.join(testdir, "twotables_2.pdf")
-    tables = camelot.read_pdf(filename, table_areas=["80,693,535,448"])
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        table_areas=["80,693,535,448"]
+    )
    assert_frame_equal(df, tables[0].df)


-def test_lattice_process_background():
+def test_lattice_process_background(parallel):
    df = pd.DataFrame(data_lattice_process_background)

    filename = os.path.join(testdir, "background_lines_1.pdf")
-    tables = camelot.read_pdf(filename, process_background=True)
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        process_background=True,
+    )
    assert_frame_equal(df, tables[1].df)


-def test_lattice_copy_text():
+def test_lattice_copy_text(parallel):
    df = pd.DataFrame(data_lattice_copy_text)

    filename = os.path.join(testdir, "row_span_1.pdf")
-    tables = camelot.read_pdf(filename, line_scale=60, copy_text="v")
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        line_scale=60,
+        copy_text="v"
+    )
    assert_frame_equal(df, tables[0].df)


-def test_lattice_shift_text():
+def test_lattice_shift_text(parallel):
    df_lt = pd.DataFrame(data_lattice_shift_text_left_top)
    df_disable = pd.DataFrame(data_lattice_shift_text_disable)
    df_rb = pd.DataFrame(data_lattice_shift_text_right_bottom)

    filename = os.path.join(testdir, "column_span_2.pdf")
-    tables = camelot.read_pdf(filename, line_scale=40)
+    tables = camelot.read_pdf(filename, parallel=parallel, line_scale=40)
    assert df_lt.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=[""])
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        line_scale=40,
+        shift_text=[""]
+    )
    assert df_disable.equals(tables[0].df)

-    tables = camelot.read_pdf(filename, line_scale=40, shift_text=["r", "b"])
+    tables = camelot.read_pdf(
+        filename,
+        parallel=parallel,
+        line_scale=40,
+        shift_text=["r", "b"]
+    )
    assert df_rb.equals(tables[0].df)


-def test_repr():
+def test_repr(parallel):
    filename = os.path.join(testdir, "foo.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -234,23 +290,23 @@ def test_repr():
    )


-def test_pages():
+def test_pages(parallel):
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(url)
+    tables = camelot.read_pdf(url, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )

-    tables = camelot.read_pdf(url, pages="1-end")
+    tables = camelot.read_pdf(url, pages="1-end", parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
        repr(tables[0].cells[0][0]) == "<Cell x1=120.48 y1=218.43 x2=164.64 y2=233.77>"
    )

-    tables = camelot.read_pdf(url, pages="all")
+    tables = camelot.read_pdf(url, pages="all", parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -258,9 +314,9 @@ def test_pages():
    )


-def test_url():
+def test_url(parallel):
    url = "https://camelot-py.readthedocs.io/en/master/_static/pdf/foo.pdf"
-    tables = camelot.read_pdf(url)
+    tables = camelot.read_pdf(url, parallel=parallel)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"
    assert (
@ -268,11 +324,11 @@ def test_url():
    )


-def test_arabic():
+def test_arabic(parallel):
    df = pd.DataFrame(data_arabic)

    filename = os.path.join(testdir, "tabula/arabic.pdf")
-    tables = camelot.read_pdf(filename)
+    tables = camelot.read_pdf(filename, parallel=parallel)
    assert_frame_equal(df, tables[0].df)