Merge de75c05fc0 into 83f816f104

2019-10-15 12:11:00 +00:00 · 2019-10-15 12:11:00 +00:00 · 39fec76cd1
parent 83f816f104 de75c05fc0
commit 39fec76cd1
5 changed files with 47 additions and 3 deletions
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@ -34,7 +34,7 @ class PDFHandler(object):

    """

-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, multi=None):
        if is_url(filepath):
            filepath = download_url(filepath)
        self.filepath = filepath
@ -48,6 +48,7 @ class PDFHandler(object):
            if sys.version_info[0] < 3:
                self.password = self.password.encode("ascii")
        self.pages = self._get_pages(self.filepath, pages)
+        self.multi = multi or {}  # page number (str) -> per-page kwargs; None/empty means no overrides

    def _get_pages(self, filepath, pages):
        """Converts pages string to list of ints.
@ -168,7 +169,22 @ class PDFHandler(object):
            ]
            parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
            for p in pages:
-                t = parser.extract_tables(
+                # Key the page by the trailing digits of the file stem so that
+                # multi-digit pages match too (p[-5] only saw the last digit).
+                # NOTE(review): assumes p ends in "<page number>.<ext>" -- confirm.
+                stem = p.rsplit(".", 1)[0]
+                p_no = stem[len(stem.rstrip("0123456789")):]
+                page_parser = parser
+                if self.multi and p_no in self.multi:
+                    # Merge into a copy: updating kwargs in place would leak
+                    # this page's overrides into every later page.
+                    page_kwargs = dict(kwargs, **self.multi[p_no])
+                    page_parser = (
+                        Lattice(**page_kwargs)
+                        if flavor == "lattice"
+                        else Stream(**page_kwargs)
+                    )
+                t = page_parser.extract_tables(
                    p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                )
                tables.extend(t)
--- a/camelot/io.py
+++ b/camelot/io.py
@ -13,6 +13,7 @@ def read_pdf(
    flavor="lattice",
    suppress_stdout=False,
    layout_kwargs={},
+    multi = {},
    **kwargs
 ):
    """Read PDF and return extracted tables.
@ -36,6 +37,9 @@ def read_pdf(
        Print all logs and warnings.
    layout_kwargs : dict, optional (default: {})
        A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
+    multi : dict, optional (default: {})
+        Per-page parameter overrides. Maps a page number (str) to a dict of
+        keyword arguments, which take precedence over kwargs for that page.
    table_areas : list, optional (default: None)
        List of table area strings of the form x1,y1,x2,y2
        where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@ -108,7 +112,7 @@ def read_pdf(
            warnings.simplefilter("ignore")

        validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
        kwargs = remove_extra(kwargs, flavor=flavor)
        tables = p.parse(
            flavor=flavor,
--- a/tests/data.py
+++ b/tests/data.py
@ -2858,3 +2858,20 @@ data_stream_layout_kwargs = [
    ["A.O.P Cornas", ""],
    ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
 ]
+
+data_multi_params1 = [  # expected tables[0].df in tests/test_common.py::test_multi_params
+    ["Number of Coils", "Number of Paperclips"],
+    ["5", "3, 5, 4"],
+    ["10", "7, 8, 6"],
+    ["15", "11, 10, 12"],
+    ["20", "15, 13, 14"]
+]
+
+data_multi_params2 = [  # expected tables[1].df in tests/test_common.py::test_multi_params
+    ["Time (drops of water)", "Distance (cm)"],
+    ["1", "10,11,9"],
+    ["2", "29, 31, 30"],
+    ["3", "59, 58, 61"],
+    ["4", "102, 100, 98"],
+    ["5", "122, 125, 127"]
+]
--- a/tests/files/multi_params.pdf
+++ b/tests/files/multi_params.pdf
--- a/tests/test_common.py
+++ b/tests/test_common.py
@ -298,3 +298,16 @@ def test_table_order():
        (1, 2),
        (1, 1),
    ]
+
+def test_multi_params():
+    """Check extraction when page 2 gets its own table_regions via multi."""
+    pdf_path = os.path.join(testdir, "multi_params.pdf")
+    page_overrides = {'2': {"table_regions": ["120, 210, 400, 90"]}}
+    extracted = camelot.read_pdf(
+        pdf_path, pages="all", multi=page_overrides, split_text=True
+    )
+    expected_first = pd.DataFrame(data_multi_params1)
+    expected_second = pd.DataFrame(data_multi_params2)
+    # Only the first two extracted tables are compared.
+    assert expected_first.equals(extracted[0].df)
+    assert expected_second.equals(extracted[1].df)