diff --git a/HISTORY.md b/HISTORY.md index 9b06b01..6a0a2e7 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,12 @@ Release History master ------ +**Improvements** + +* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta. + * Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`. + * The `margins` keyword argument in `read_pdf()` is now deprecated. + 0.5.0 (2018-12-13) ------------------ diff --git a/camelot/handlers.py b/camelot/handlers.py index a312131..35708ee 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -125,7 +125,7 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, **kwargs): + def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -136,6 +136,8 @@ class PDFHandler(object): Lattice is used by default. suppress_stdout : str (default: False) Suppress logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. kwargs : dict See camelot.read_pdf kwargs. 
@@ -153,6 +155,7 @@ class PDFHandler(object): for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout) + t = parser.extract_tables(p, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs) tables.extend(t) return TableList(tables) diff --git a/camelot/io.py b/camelot/io.py index 4b436ff..de2052b 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, **kwargs): + suppress_stdout=False, layout_kwargs={}, **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -77,10 +79,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
Returns ------- @@ -98,5 +96,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) - tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs) + tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, + layout_kwargs=layout_kwargs, **kwargs) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index ebc4564..a3280de 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,11 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. """ - def _generate_layout(self, filename): + def _generate_layout(self, filename, layout_kwargs): self.filename = filename + self.layout_kwargs = layout_kwargs self.layout, self.dimensions = get_page_layout( - self.filename, - char_margin=self.char_margin, - line_margin=self.line_margin, - word_margin=self.word_margin) + filename, **layout_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index da3524f..0ec53bd 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -70,17 +70,13 @@ class Lattice(BaseParser): Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
""" def __init__(self, table_areas=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, margins=(1.0, 0.5, 0.1), **kwargs): + iterations=0, **kwargs): self.table_areas = table_areas self.process_background = process_background self.line_size_scaling = line_size_scaling @@ -93,7 +89,6 @@ class Lattice(BaseParser): self.threshold_blocksize = threshold_blocksize self.threshold_constant = threshold_constant self.iterations = iterations - self.char_margin, self.line_margin, self.word_margin = margins @staticmethod def _reduce_index(t, idx, shift_text): @@ -348,8 +343,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 4bf482d..5ebd2df 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -44,15 +44,10 @@ class Stream(BaseParser): col_close_tol : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. - margins : tuple, optional (default: (1.0, 0.5, 0.1)) - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
""" def __init__(self, table_areas=None, columns=None, split_text=False, - flag_size=False, row_close_tol=2, col_close_tol=0, - margins=(1.0, 0.5, 0.1), **kwargs): + flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs): self.table_areas = table_areas self.columns = columns self._validate_columns() @@ -60,7 +55,6 @@ class Stream(BaseParser): self.flag_size = flag_size self.row_close_tol = row_close_tol self.col_close_tol = col_close_tol - self.char_margin, self.line_margin, self.word_margin = margins @staticmethod def _text_bbox(t_bbox): @@ -388,8 +382,8 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/utils.py b/camelot/utils.py index cd55e4e..c38884f 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -558,7 +558,7 @@ def compute_whitespace(d): def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, - detect_vertical=True, all_texts=True): + detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. See https://euske.github.io/pdfminer/ to get definitions of kwargs. diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 37e8d01..a36ea2c 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -524,3 +524,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." "4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." 
+ +Tweak layout generation +----------------------- + +Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 <https://github.com/socialcopsdev/camelot/issues/170>`_ and `#215 <https://github.com/socialcopsdev/camelot/issues/215>`_), PDFMiner can group characters that should belong to the same sentence into separate sentences. + +To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To know more about the parameters you can tweak, you can check out `PDFMiner docs <https://euske.github.io/pdfminer/>`_. + +:: + + >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False}) diff --git a/tests/data.py b/tests/data.py index 677c58b..246a711 100755 --- a/tests/data.py +++ b/tests/data.py @@ -491,3 +491,43 @@ data_arabic = [ ['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'], ['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', ''] ] + +data_stream_layout_kwargs = [ + ['V i n s a u Ve r r e', ''], + ['Les Blancs', '12.5CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de la Guicharde « Autour de la chapelle » 2016', '8 €'], + ['A.O.P Vacqueyras', ''], + ['Domaine de Montvac « Melodine » 2016', '10 €'], + ['A.O.P Châteauneuf du Pape', ''], + ['Domaine de Beaurenard 2017', '13 €'], + ['A.O.P Côteaux du Languedoc', ''], + ['Villa Tempora « Un temps pour elle » 2014', '9 €'], + ['A.O.P Côtes de Provence', ''], + ['Château Grand Boise 2017', '9 €'], + ['Les Rosés', '12,5 CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'], + ['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'], + ['A.O.P Vacqueyras', ''], + ['Domaine de Montvac 2017', '9 €'], + ['A.O.P Languedoc', ''], + ['Domaine de Joncas « Nébla » 2015', '8 €'], + ['Villa Tempora « L’arroseur arrosé » 2015', '9 €'], + ['A.O.P Côtes de Provence', ''], + ['Château Grand Boise « Sainte Victoire » 2017', '9 €'], + ['Château Léoube 2016', '10 €'], + ['Les Rouges', 
'12,CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de Dionysos « La Cigalette »', '8 €'], + ['Château Saint Estève d’Uchaux « Grande Réserve » 2014', '9 €'], + ['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'], + ['Domaine de la Florane « Terre Pourpre » 2014', '10 €'], + ['L’Oratoire St Martin « Réserve des Seigneurs » 2015', '11 €'], + ['A.O.P Saint Joseph', ''], + ['Domaine Monier Perréol « Châtelet » 2015', '13 €'], + ['A.O.P Châteauneuf du Pape', ''], + ['Domaine de Beaurenard 2011', '15 €'], + ['A.O.P Cornas', ''], + ['Domaine Lionnet « Terre Brûlée » 2012', '15 €'] +] diff --git a/tests/files/detect_vertical_false.pdf b/tests/files/detect_vertical_false.pdf new file mode 100644 index 0000000..17d8a0d Binary files /dev/null and b/tests/files/detect_vertical_false.pdf differ diff --git a/tests/test_common.py b/tests/test_common.py index 5f8c81c..34ae94f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -102,6 +102,15 @@ def test_stream_flag_size(): assert df.equals(tables[0].df) +def test_stream_layout_kwargs(): + df = pd.DataFrame(data_stream_layout_kwargs) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", layout_kwargs={"detect_vertical": False}) + assert df.equals(tables[0].df) + + def test_lattice(): df = pd.DataFrame(data_lattice)