From b167a8c7dd0f43ab54b3cd5f51f888cacdf16fce Mon Sep 17 00:00:00 2001
From: Sahil Verma
Date: Sun, 28 Jul 2019 10:20:47 -0500
Subject: [PATCH] Added multi parameter for page level parameters

---
Review notes (this section is ignored by `git am`):

* The page key was computed as `p[-5]`, i.e. only the last character
  before ".pdf", so page 12 was looked up as "2". It is now the full run
  of trailing digits in the file name.
* `page_kwargs = kwargs` aliased the shared dict, so `.update()` leaked one
  page's overrides into later pages. Overrides are now merged into a copy.
* `PDFHandler.__init__` defaulted `multi` to a list although it is used as
  a dict; both entry points now default to None (treated as "no
  overrides").
* Hunk headers and the diffstat were recomputed for the new line counts.
  The blob ids on the `index` lines are those of the original submission.
* Context-line indentation and blank context lines were reconstructed from
  the hunk line counts; the `LAParams` context line in camelot/io.py is
  reproduced as received and may be missing its link target. Verify the
  context against the target tree before applying.

 camelot/handlers.py | 23 +++++++++++++++++++++--
 camelot/io.py       |  7 ++++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/camelot/handlers.py b/camelot/handlers.py
index 3a6d663..9c7c384 100644
--- a/camelot/handlers.py
+++ b/camelot/handlers.py
@@ -34,7 +34,7 @@ class PDFHandler(object):
 
     """
 
-    def __init__(self, filepath, pages="1", password=None):
+    def __init__(self, filepath, pages="1", password=None, multi=None):
         if is_url(filepath):
             filepath = download_url(filepath)
         self.filepath = filepath
@@ -48,6 +48,8 @@ class PDFHandler(object):
             if sys.version_info[0] < 3:
                 self.password = self.password.encode("ascii")
         self.pages = self._get_pages(self.filepath, pages)
+        # Per-page parser overrides, keyed by page number as a str.
+        self.multi = {} if multi is None else multi
 
     def _get_pages(self, filepath, pages):
         """Converts pages string to list of ints.
@@ -168,7 +170,24 @@ class PDFHandler(object):
             ]
             parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
             for p in pages:
-                t = parser.extract_tables(
+                # Page number = trailing digits of the file name, before the
+                # 4-character ".pdf" extension (handles multi-digit pages).
+                stem = p[:-4]
+                p_no = stem[len(stem.rstrip("0123456789")):]
+
+                page_parser = parser
+                if p_no in self.multi:
+                    # Merge into a copy so one page's overrides never leak
+                    # into the kwargs shared by the other pages.
+                    page_kwargs = dict(kwargs)
+                    page_kwargs.update(self.multi[p_no])
+                    page_parser = (
+                        Lattice(**page_kwargs)
+                        if flavor == "lattice"
+                        else Stream(**page_kwargs)
+                    )
+
+                t = page_parser.extract_tables(
                     p, suppress_stdout=suppress_stdout, layout_kwargs=layout_kwargs
                 )
                 tables.extend(t)
diff --git a/camelot/io.py b/camelot/io.py
index a27a7c6..2988a6f 100644
--- a/camelot/io.py
+++ b/camelot/io.py
@@ -13,6 +13,7 @@ def read_pdf(
     flavor="lattice",
     suppress_stdout=False,
     layout_kwargs={},
+    multi=None,
     **kwargs
 ):
     """Read PDF and return extracted tables.
@@ -36,6 +37,10 @@ def read_pdf(
         Print all logs and warnings.
     layout_kwargs : dict, optional (default: {})
         A dict of `pdfminer.layout.LAParams `_ kwargs.
+    multi : dict, optional (default: None)
+        Per-page parameter overrides. Maps a page number given as a
+        str (e.g. "2") to a dict of parser kwargs for that page only;
+        these take precedence over kwargs on that page.
     table_areas : list, optional (default: None)
         List of table area strings of the form x1,y1,x2,y2
         where (x1, y1) -> left-top and (x2, y2) -> right-bottom
@@ -108,7 +113,7 @@ def read_pdf(
             warnings.simplefilter("ignore")
 
         validate_input(kwargs, flavor=flavor)
-        p = PDFHandler(filepath, pages=pages, password=password)
+        p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
         kwargs = remove_extra(kwargs, flavor=flavor)
         tables = p.parse(
             flavor=flavor,