diff --git a/camelot/handlers.py b/camelot/handlers.py
index 3a6d663..9c7c384 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -34,7 +34,7 @@ class PDFHandler(object):
 
     """
 
-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, multi=None):
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath = filepath
@@ -48,6 +48,10 @@ class PDFHandler(object):
             if sys.version_info[0] < 3:
                 self.password = self.password.encode("ascii")
         self.pages = self._get_pages(self.filepath, pages)
+        # Per-page parser overrides, keyed by page number as a string.
+        # Built as a fresh dict so no default or caller-owned mapping is
+        # aliased, and so integer keys (2) match as well as "2".
+        self.multi = {str(page): opts for page, opts in (multi or {}).items()}
 
     def _get_pages(self, filepath, pages):
         """Converts pages string to list of ints.
@@ -168,7 +172,24 @@ class PDFHandler(object):
             ]
             parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
             for p in pages:
-                t = parser.extract_tables(
+                # The page number is the run of digits just before the file
+                # extension; p[-5] alone would truncate "12" to "2".
+                stem = p.rsplit(".", 1)[0]
+                p_no = stem[len(stem.rstrip("0123456789")):]
+                page_parser = parser
+
+                if p_no in self.multi:
+                    # Merge into a copy: updating kwargs in place would leak
+                    # this page's overrides into every later page.
+                    page_kwargs = dict(kwargs)
+                    page_kwargs.update(self.multi[p_no])
+                    page_parser = (
+                        Lattice(**page_kwargs)
+                        if flavor == "lattice"
+                        else Stream(**page_kwargs)
+                    )
+
+                t = page_parser.extract_tables(
                     p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                 )
                 tables.extend(t)
diff --git a/camelot/io.py b/camelot/io.py
index a27a7c6..2988a6f 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -13,6 +13,7 @@ def read_pdf(
     flavor="lattice",
     suppress_stdout=False,
     layout_kwargs={},
+    multi=None,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -36,6 +37,9 @@ def read_pdf(
         Print all logs and warnings.
     layout_kwargs : dict, optional (default: {})
         A dict of `pdfminer.layout.LAParams `_ kwargs.
+    multi : dict, optional (default: None)
+        Per-page parameters, mapping a page number (str) to a dict of
+        kwargs. Values given here override kwargs for that page only.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -108,7 +112,7 @@ def read_pdf(
             warnings.simplefilter("ignore")
 
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,
diff --git a/tests/data.py b/tests/data.py
index 3338a81..dafdcd1 100755
--- a/tests/data.py
+++ b/tests/data.py
@@ -2858,3 +2858,20 @@ data_stream_layout_kwargs = [
     ["A.O.P Cornas", ""],
     ["Domaine Lionnet « Terre Brûlée » 2012", "15 €"],
 ]
+
+data_multi_params1 = [
+    ["Number of Coils", "Number of Paperclips"],
+    ["5", "3, 5, 4"],
+    ["10", "7, 8, 6"],
+    ["15", "11, 10, 12"],
+    ["20", "15, 13, 14"]
+]
+
+data_multi_params2 = [
+    ["Time (drops of water)", "Distance (cm)"],
+    ["1", "10,11,9"],
+    ["2", "29, 31, 30"],
+    ["3", "59, 58, 61"],
+    ["4", "102, 100, 98"],
+    ["5", "122, 125, 127"]
+]
diff --git a/tests/files/multi_params.pdf b/tests/files/multi_params.pdf
new file mode 100644
index 0000000..0fe1113
Binary files /dev/null and b/tests/files/multi_params.pdf differ
diff --git a/tests/test_common.py b/tests/test_common.py
index 0bf8f61..6ed1743 100644
--- a/tests/test_common.py
+++ b/tests/test_common.py
@@ -298,3 +298,16 @@ def test_table_order():
         (1, 2),
         (1, 1),
     ]
+
+def test_multi_params():
+    df1 = pd.DataFrame(data_multi_params1)
+    df2 = pd.DataFrame(data_multi_params2)
+
+    filename = os.path.join(
+        testdir, "multi_params.pdf"
+    )
+    tables = camelot.read_pdf(filename, pages="all", multi={'2': {"table_regions": ["120, 210, 400, 90"]}},
+                              split_text=True)
+
+    assert df1.equals(tables[0].df)
+    assert df2.equals(tables[1].df)