diff --git a/camelot/handlers.py b/camelot/handlers.py index a312131..091e3b7 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -125,7 +125,7 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, **kwargs): + def parse(self, flavor='lattice', suppress_stdout=False, extra_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -136,6 +136,8 @@ class PDFHandler(object): Lattice is used by default. suppress_stdout : str (default: False) Suppress logs and warnings. + extra_kwargs : dict, optional (default: {}) + A dict of pdfminer.layout.LAParams kwargs. kwargs : dict See camelot.read_pdf kwargs. @@ -153,6 +155,7 @@ class PDFHandler(object): for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout) + t = parser.extract_tables(p, suppress_stdout=suppress_stdout, + extra_kwargs=extra_kwargs) tables.extend(t) return TableList(tables) diff --git a/camelot/io.py b/camelot/io.py index 4b436ff..c7321a3 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, **kwargs): + suppress_stdout=False, extra_kwargs={}, **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + extra_kwargs : dict, optional (default: {}) + A dict of pdfminer.layout.LAParams kwargs. 
table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -98,5 +100,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) - tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs) + tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, + extra_kwargs=extra_kwargs, **kwargs) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index ebc4564..35fcdd1 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,10 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. """ - def _generate_layout(self, filename): + def _generate_layout(self, filename, extra_kwargs): self.filename = filename self.layout, self.dimensions = get_page_layout( - self.filename, - char_margin=self.char_margin, - line_margin=self.line_margin, - word_margin=self.word_margin) + self.filename, **extra_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index da3524f..b89452e 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -348,8 +348,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): + self._generate_layout(filename, extra_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 4bf482d..a478e0c 100644 --- 
a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -388,8 +388,8 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): + self._generate_layout(filename, extra_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/utils.py b/camelot/utils.py index cd55e4e..c38884f 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -558,7 +558,7 @@ def compute_whitespace(d): def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, - detect_vertical=True, all_texts=True): + detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions of kwargs.