From ca6cefa36268e6e8fb5fa73d9898fd82789c2cb8 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 17 Dec 2018 11:49:05 +0530 Subject: [PATCH 1/6] Add extra_kwargs --- camelot/handlers.py | 7 +++++-- camelot/io.py | 7 +++++-- camelot/parsers/base.py | 7 ++----- camelot/parsers/lattice.py | 4 ++-- camelot/parsers/stream.py | 4 ++-- camelot/utils.py | 2 +- 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/camelot/handlers.py b/camelot/handlers.py index a312131..091e3b7 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -125,7 +125,7 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, **kwargs): + def parse(self, flavor='lattice', suppress_stdout=False, extra_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -136,6 +136,8 @@ class PDFHandler(object): Lattice is used by default. suppress_stdout : str (default: False) Suppress logs and warnings. + extra_kwargs : dict, optional (default: {}) + A dict of pdfminer.layout.LAParams kwargs. kwargs : dict See camelot.read_pdf kwargs. @@ -153,6 +155,7 @@ class PDFHandler(object): for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout) + t = parser.extract_tables(p, suppress_stdout=suppress_stdout + extra_kwargs=extra_kwargs) tables.extend(t) return TableList(tables) diff --git a/camelot/io.py b/camelot/io.py index 4b436ff..c7321a3 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, **kwargs): + suppress_stdout=False, extra_kwargs={}, **kwargs): """Read PDF and return extracted tables. 
Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -26,6 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. + extra_kwargs : dict, optional (default: {}) + A dict of pdfminer.layout.LAParams kwargs. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -98,5 +100,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', validate_input(kwargs, flavor=flavor) p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) - tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, **kwargs) + tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, + extra_kwargs=extra_kwargs, **kwargs) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index ebc4564..35fcdd1 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,13 +8,10 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. 
""" - def _generate_layout(self, filename): + def _generate_layout(self, filename, extra_kwargs): self.filename = filename self.layout, self.dimensions = get_page_layout( - self.filename, - char_margin=self.char_margin, - line_margin=self.line_margin, - word_margin=self.word_margin) + self.filename, **self.extra_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index da3524f..b89452e 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -348,8 +348,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): + self._generate_layout(filename, extra_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index 4bf482d..a478e0c 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -388,8 +388,8 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False): - self._generate_layout(filename) + def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): + self._generate_layout(filename, extra_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/utils.py b/camelot/utils.py index cd55e4e..c38884f 100644 --- a/camelot/utils.py +++ b/camelot/utils.py @@ -558,7 +558,7 @@ def compute_whitespace(d): def get_page_layout(filename, char_margin=1.0, line_margin=0.5, word_margin=0.1, - detect_vertical=True, all_texts=True): + detect_vertical=True, all_texts=True): """Returns a PDFMiner LTPage object and page dimension of a single page pdf. 
See https://euske.github.io/pdfminer/ to get definitions of kwargs. From 01dab12fbc4e2bec6ecb22ad144b1ad143477401 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 17 Dec 2018 11:53:00 +0530 Subject: [PATCH 2/6] Fix SyntaxError --- camelot/handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/camelot/handlers.py b/camelot/handlers.py index 091e3b7..82b5c56 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -155,7 +155,7 @@ class PDFHandler(object): for p in self.pages] parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: - t = parser.extract_tables(p, suppress_stdout=suppress_stdout + t = parser.extract_tables(p, suppress_stdout=suppress_stdout, extra_kwargs=extra_kwargs) tables.extend(t) return TableList(tables) From 6301fee5238bc9edd5b6c741afb54053bc27d4e5 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Mon, 17 Dec 2018 12:00:41 +0530 Subject: [PATCH 3/6] Fix AttributeError --- camelot/parsers/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index 35fcdd1..f091cf1 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -10,8 +10,9 @@ class BaseParser(object): """ def _generate_layout(self, filename, extra_kwargs): self.filename = filename + self.extra_kwargs = extra_kwargs self.layout, self.dimensions = get_page_layout( - self.filename, **self.extra_kwargs) + filename, **extra_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions From 48b2dce6334f672662864cedad60f0b1fd181b31 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 19 Dec 2018 18:19:39 +0530 Subject: [PATCH 4/6] Update advanced docs --- HISTORY.md | 6 ++++++ camelot/handlers.py | 8 ++++---- camelot/io.py | 12 ++++-------- camelot/parsers/base.py | 6 +++--- camelot/parsers/lattice.py | 11 +++-------- 
camelot/parsers/stream.py | 12 +++--------- docs/user/advanced.rst | 11 +++++++++++ 7 files changed, 34 insertions(+), 32 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9b06b01..6a0a2e7 100755 --- a/HISTORY.md +++ b/HISTORY.md @@ -4,6 +4,12 @@ Release History master ------ +**Improvements** + +* [#170](https://github.com/socialcopsdev/camelot/issues/170) Add option to pass pdfminer layout kwargs. [#232](https://github.com/socialcopsdev/camelot/pull/232) by Vinayak Mehta. + * Keyword arguments for [pdfminer.layout.LAParams](https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33) can now be passed using `layout_kwargs` in `read_pdf()`. + * The `margins` keyword argument in `read_pdf()` is now deprecated. + 0.5.0 (2018-12-13) ------------------ diff --git a/camelot/handlers.py b/camelot/handlers.py index 82b5c56..35708ee 100644 --- a/camelot/handlers.py +++ b/camelot/handlers.py @@ -125,7 +125,7 @@ class PDFHandler(object): with open(fpath, 'wb') as f: outfile.write(f) - def parse(self, flavor='lattice', suppress_stdout=False, extra_kwargs={}, **kwargs): + def parse(self, flavor='lattice', suppress_stdout=False, layout_kwargs={}, **kwargs): """Extracts tables by calling parser.get_tables on all single page PDFs. @@ -136,8 +136,8 @@ class PDFHandler(object): Lattice is used by default. suppress_stdout : str (default: False) Suppress logs and warnings. - extra_kwargs : dict, optional (default: {}) - A dict of pdfminer.layout.LAParams kwargs. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. kwargs : dict See camelot.read_pdf kwargs. 
@@ -156,6 +156,6 @@ class PDFHandler(object): parser = Lattice(**kwargs) if flavor == 'lattice' else Stream(**kwargs) for p in pages: t = parser.extract_tables(p, suppress_stdout=suppress_stdout, - extra_kwargs=extra_kwargs) + layout_kwargs=layout_kwargs) tables.extend(t) return TableList(tables) diff --git a/camelot/io.py b/camelot/io.py index c7321a3..de2052b 100644 --- a/camelot/io.py +++ b/camelot/io.py @@ -6,7 +6,7 @@ from .utils import validate_input, remove_extra def read_pdf(filepath, pages='1', password=None, flavor='lattice', - suppress_stdout=False, extra_kwargs={}, **kwargs): + suppress_stdout=False, layout_kwargs={}, **kwargs): """Read PDF and return extracted tables. Note: kwargs annotated with ^ can only be used with flavor='stream' @@ -26,8 +26,8 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Lattice is used by default. suppress_stdout : bool, optional (default: True) Print all logs and warnings. - extra_kwargs : dict, optional (default: {}) - A dict of pdfminer.layout.LAParams kwargs. + layout_kwargs : dict, optional (default: {}) + A dict of `pdfminer.layout.LAParams `_ kwargs. table_areas : list, optional (default: None) List of table area strings of the form x1,y1,x2,y2 where (x1, y1) -> left-top and (x2, y2) -> right-bottom @@ -79,10 +79,6 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
Returns ------- @@ -101,5 +97,5 @@ def read_pdf(filepath, pages='1', password=None, flavor='lattice', p = PDFHandler(filepath, pages=pages, password=password) kwargs = remove_extra(kwargs, flavor=flavor) tables = p.parse(flavor=flavor, suppress_stdout=suppress_stdout, - extra_kwargs=extra_kwargs, **kwargs) + layout_kwargs=layout_kwargs, **kwargs) return tables diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py index f091cf1..a3280de 100644 --- a/camelot/parsers/base.py +++ b/camelot/parsers/base.py @@ -8,11 +8,11 @@ from ..utils import get_page_layout, get_text_objects class BaseParser(object): """Defines a base parser. """ - def _generate_layout(self, filename, extra_kwargs): + def _generate_layout(self, filename, layout_kwargs): self.filename = filename - self.extra_kwargs = extra_kwargs + self.layout_kwargs = layout_kwargs self.layout, self.dimensions = get_page_layout( - filename, **extra_kwargs) + filename, **layout_kwargs) self.horizontal_text = get_text_objects(self.layout, ltype="lh") self.vertical_text = get_text_objects(self.layout, ltype="lv") self.pdf_width, self.pdf_height = self.dimensions diff --git a/camelot/parsers/lattice.py b/camelot/parsers/lattice.py index b89452e..0ec53bd 100644 --- a/camelot/parsers/lattice.py +++ b/camelot/parsers/lattice.py @@ -70,17 +70,13 @@ class Lattice(BaseParser): Number of times for erosion/dilation is applied. For more information, refer `OpenCV's dilate `_. - margins : tuple - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
""" def __init__(self, table_areas=None, process_background=False, line_size_scaling=15, copy_text=None, shift_text=['l', 't'], split_text=False, flag_size=False, line_close_tol=2, joint_close_tol=2, threshold_blocksize=15, threshold_constant=-2, - iterations=0, margins=(1.0, 0.5, 0.1), **kwargs): + iterations=0, **kwargs): self.table_areas = table_areas self.process_background = process_background self.line_size_scaling = line_size_scaling @@ -93,7 +89,6 @@ class Lattice(BaseParser): self.threshold_blocksize = threshold_blocksize self.threshold_constant = threshold_constant self.iterations = iterations - self.char_margin, self.line_margin, self.word_margin = margins @staticmethod def _reduce_index(t, idx, shift_text): @@ -348,8 +343,8 @@ class Lattice(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): - self._generate_layout(filename, extra_kwargs) + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/camelot/parsers/stream.py b/camelot/parsers/stream.py index a478e0c..5ebd2df 100644 --- a/camelot/parsers/stream.py +++ b/camelot/parsers/stream.py @@ -44,15 +44,10 @@ class Stream(BaseParser): col_close_tol : int, optional (default: 0) Tolerance parameter used to combine text horizontally, to generate columns. - margins : tuple, optional (default: (1.0, 0.5, 0.1)) - PDFMiner char_margin, line_margin and word_margin. - - For more information, refer `PDFMiner docs `_. 
""" def __init__(self, table_areas=None, columns=None, split_text=False, - flag_size=False, row_close_tol=2, col_close_tol=0, - margins=(1.0, 0.5, 0.1), **kwargs): + flag_size=False, row_close_tol=2, col_close_tol=0, **kwargs): self.table_areas = table_areas self.columns = columns self._validate_columns() @@ -60,7 +55,6 @@ class Stream(BaseParser): self.flag_size = flag_size self.row_close_tol = row_close_tol self.col_close_tol = col_close_tol - self.char_margin, self.line_margin, self.word_margin = margins @staticmethod def _text_bbox(t_bbox): @@ -388,8 +382,8 @@ class Stream(BaseParser): return table - def extract_tables(self, filename, suppress_stdout=False, extra_kwargs={}): - self._generate_layout(filename, extra_kwargs) + def extract_tables(self, filename, suppress_stdout=False, layout_kwargs={}): + self._generate_layout(filename, layout_kwargs) if not suppress_stdout: logger.info('Processing {}'.format(os.path.basename(self.rootname))) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 37e8d01..4be85ca 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -524,3 +524,14 @@ We don't need anything else. Now, let's pass ``copy_text=['v']`` to copy text in "4","West Bengal","West Medinipur","iv. Acute Diarrhoeal Disease","145","0","04/01/14","05/01/14","Under control","..." "4","West Bengal","Birbhum","v. Food Poisoning","199","0","31/12/13","31/12/13","Under control","..." "4","West Bengal","Howrah","vi. Viral Hepatitis A &E","85","0","26/12/13","27/12/13","Under surveillance","..." + +Tweak layout generation +----------------------- + +Camelot is built on top of PDFMiner's functionality of grouping characters on a page into words and sentences. In some cases (such as `#170 `_ and `#215 `_), PDFMiner can group characters that should belong to the same sentence into separate sentences. 
+ +To deal with such cases, you can tweak PDFMiner's `LAParams kwargs <https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ to improve layout generation, by passing the keyword arguments as a dict using ``layout_kwargs`` in :meth:`read_pdf() <camelot.read_pdf>`. To know more about the parameters you can tweak, you can check out `PDFMiner docs <https://euske.github.io/pdfminer/>`_. + +:: + + >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': True}) From 17d48be46efba6bec9dfa1601695bef3642d5eb6 Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 19 Dec 2018 18:31:54 +0530 Subject: [PATCH 5/6] Add test --- tests/data.py | 40 ++++++++++++++++++++++++++ tests/files/detect_vertical_false.pdf | Bin 0 -> 12705 bytes tests/test_common.py | 9 ++++++ 3 files changed, 49 insertions(+) create mode 100644 tests/files/detect_vertical_false.pdf diff --git a/tests/data.py b/tests/data.py index 677c58b..246a711 100755 --- a/tests/data.py +++ b/tests/data.py @@ -491,3 +491,43 @@ data_arabic = [ ['؟ﻙﺎﺒﺷ\xa0ﺖﻧﺍ\xa0ﻞﻫ', 'ﺔﻳﺰﻴﻠﺠﻧﻻﺍ\xa0ﻲﻓ\xa0Jeremy\xa0ﻲﻤﺳﺍ'], ['Jeremy\xa0is\xa0ﻲﻣﺮﺟ\xa0in\xa0Arabic', ''] ] + +data_stream_layout_kwargs = [ + ['V i n s a u Ve r r e', ''], + ['Les Blancs', '12.5CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de la Guicharde « Autour de la chapelle » 2016', '8 €'], + ['A.O.P Vacqueyras', ''], + ['Domaine de Montvac « Melodine » 2016', '10 €'], + ['A.O.P Châteauneuf du Pape', ''], + ['Domaine de Beaurenard 2017', '13 €'], + ['A.O.P Côteaux du Languedoc', ''], + ['Villa Tempora « Un temps pour elle » 2014', '9 €'], + ['A.O.P Côtes de Provence', ''], + ['Château Grand Boise 2017', '9 €'], + ['Les Rosés', '12,5 CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de la Florane « A fleur de Pampre » 2016', '8 €'], + ['Famille Coulon (Domaine Beaurenard) Biotifulfox 2017', '8 €'], + ['A.O.P Vacqueyras', ''], + ['Domaine de Montvac 2017', '9 €'], + ['A.O.P Languedoc', ''], + ['Domaine de Joncas « Nébla » 2015', '8 €'], + ['Villa Tempora « L’arroseur arrosé » 2015', '9 €'], + ['A.O.P Côtes de Provence', ''], + ['Château Grand Boise « Sainte
Victoire » 2017', '9 €'], + ['Château Léoube 2016', '10 €'], + ['Les Rouges', '12,CL'], + ['A.O.P Côtes du Rhône', ''], + ['Domaine de Dionysos « La Cigalette »', '8 €'], + ['Château Saint Estève d’Uchaux « Grande Réserve » 2014', '9 €'], + ['Domaine de la Guicharde « Cuvée Massillan » 2016', '9 €'], + ['Domaine de la Florane « Terre Pourpre » 2014', '10 €'], + ['L’Oratoire St Martin « Réserve des Seigneurs » 2015', '11 €'], + ['A.O.P Saint Joseph', ''], + ['Domaine Monier Perréol « Châtelet » 2015', '13 €'], + ['A.O.P Châteauneuf du Pape', ''], + ['Domaine de Beaurenard 2011', '15 €'], + ['A.O.P Cornas', ''], + ['Domaine Lionnet « Terre Brûlée » 2012', '15 €'] +] diff --git a/tests/files/detect_vertical_false.pdf b/tests/files/detect_vertical_false.pdf new file mode 100644 index 0000000000000000000000000000000000000000..17d8a0d90f3140ebf89f836a7c2d22620cb795da GIT binary patch literal 12705 zcmcI~2UJtdx340IQfvrP^#IZoI7vuo(t8!8OA$f{5K00`s457ENbgNSM0!V1KtU7~ zlwJe{rKt3xf(R-g+>==U-}=^l_pNtdSRr%v?Ac{z&zzax9sxr&4KcVll0o3zM9*^u z1ONjF_AY>&9H@uIJCU6M35a3@fEqYAGL{Jax}nKfH7thUfCUs301}yqMY}UI_-4Gf zwu%R*Ll96Zh&q$HZ%aA~cpo8?Sg9QWQC-atj$RXu~3 zBWvE%i{+}7l^UdVBb)E0gXV+sTey>)MK!@&lVKli@e1F&34vd~Xdo@;X2k-ko9T;| zZ(crPiMRdk@v356{&7V!e!DvUHDPHx_Ij3NcHd{)Yr%?v+r*8jy^c>7`z8C_4YtC& z;Trtp8dFu$%dD5QvQR2aTcKR(oC-}|<=r{vV?Y5&35dG`IM zC3($0)>M>M4~qHEapquFb^QmTKjr&ZcpGhhpS;fMkkE?4SG}=X5Vf#;e$@kT*O@6u zP^<`0b~(g+&RjEBbJ*9ZKPNtVt_7w%$>NQ!~dSx?pM;Lw#5^ z%OV=J!7!zQVKUx)?_Yq;*>y8Q-mtvyv~Y7(e>B^6y4fhBDKsum)Bw^xXy?l?WDxin|0EVba;m@zJ6r8nQtl4qF6<$%8Ps_iEdI~~8b_|#P{p@Yh73e>OJ zAzyG^-?2G2FLI-FhsjMEI#zaH)bnyHmyT>2cP(`-&t!dfudBuPvyLV&ONMl5)bFT~ zvfO$3rALN*sl~3;G5ZJ$c~RyeH;(8nqi&3bX{E|1VSbyt*DLvbbp&dcFu2IAAZCMW zpYOVr#+2VT|46%cuWtERIbA5d$g>ONi41RXI##3NyvkdZ<0=Ugk9#DwG+m$eKS*tk z_ddQm@`X6kw>O6MQYW4UP0y_IAmnDZ2;FykL{u`}&C>)GC9=_wm-7)9a`!X5aMgAW z=d?aUAb;5jwSul-;oDM6@LX@<^FZvJU7h_O*p1E<*ET~{rtj*U<2_vX6tH;lvy+FO2b&8 zx5X#HFUEn`-UWPFcL%J4{s&c!t32p8;&_XyXTjNY9Fg;<<@ 
zw$9ydGt2!Tu6{S7*}hNgY))K&LvE<3HNAW%t!m*#M8v(I+beIAg3Rq-?^EbaH{Gju zi_}W=UiMS|D3^*Z5AsXGe6U`xDQ(&aw>Sx8Isk1nh<3f84C|Xp2W&(>*s*h|$K}{r?zhw`&hHB5vO7HcK*iCY;W6jYSQ-Vd^gP)M-LaP0 zkAy6<-c-&mvK*IOH(uNmea!SoAhYBi#Pp@un#x8Xr+h0Z-{XzoCkA`zs@m4#1@!lo zluz}NT~&gjGc*aWPIPerk){a0%>Zk8EzMfS_{Jk)`kocZyt4L9FFIJ6p4o2?T8s`~ zq8W>Tp#W1|;iK@F;innFhHS<* zq-fyx8CpY@r01bKmv)DDpJzDV;&CL`FLhB}{nVW*I=`ONd(42j8T5{XP9CczI%c8J zL9ty*uQg1W3+TdXT-@H=kN0?Ba^o&f>un_5$gjKlqq0T;4d;l!YCL9Y7gLFTg4RWk zk`_6vW<=+-3v&%YaO|Q=?tnABy24`4&`hN=`c@I*8g5tYYncSB(A;^7mH z!>@;p?ChK3&NAgRnN;V0Pw;o4leu$nWS6`_2-j7=eRx=mLfxF;yu$SdI+LohCniTr z6Hdw*OuU>L7=3j0M88c{N=LDi%-FE9>_(c^st%pC@9OP=g#m&<=*w^gw)xtJnPuaS zqt5iYYkM-%(Ay|0-|$(%$i@ff)}IUD=?z`O5bv+CcF1jagucx#KKK0oQrK#D+_h#E z<`OMsF^$V*Q4Jb<(-1Ri%czpYM*U>v zg#&B!>SOp8KI=;&&4S&9tjZyeEzg)=%-5{ceED^LLh?rc-oo~`pFacl7dTmlN?p%i zuz1`eGsUlZIAl1zO+EYMh1fUzB2w}A=9cidPE{vK@GIhdHp+tNbci=UO-=d=pVisu ztJnCwt=n_X9ATuXztp1hI5IRb_>$&Wvy?2FE6T<2pcXiFQy}b>?U2z>jJIefUsi)e zALApow(qObj{26T*j2Wg-Z*XU125kO_K^XRz}CV&&QnZpKHm}GQGCSzmgH8ca76o!FfQ%uXe-F=U<&){rV`gp8oAsRV8pUftMGYNin-Z9WBXg? zn)sK=fwy;ymBz}wwC-0Z+}s)L^O!GS`4yH%(sj&(${#V-e}hHG+r=}ZSZN~L`Gh0?6&Oyw(hxc@I6&}y!CL? 
zx4FTBC%V%gf!j5yZOlE`fsm>b1D7v1z-J_)Nbl zg`yJWx7%Bk1h7?+*{={FtQS`Y=P(zU)IWL7{e!xI&@S;!vuHOq8koPFz?8<7m8Ia< zB0VHYy2^W5{`rRx-~cPW^6;WG;DDxRWBa9!;ms`ZJHpHJeJ8yALwb3NsJhDQPnA$osCc}l7G^#fB3t|i^Vq6 z*Y8k|2HG`q3$lgGdubLVZQCtnXVeG>qZOZOB4cG+>O7k6NFF0+Mkjy3TXUfftiS2+ zX&-l#?*$&dX9%?Ol{1mcb28$@B5E zt-vy=|G93@#8(wW%tQw}VoKEUC}JgSCh=5yk-H<)UemngIBTpSPwl=M6N{obxx;7S z-hszLXM{$b%&PX*?7ymAFUcpj#5Ken8wT<{BIL&=z0?VIRwcf>p&&10bjVrtRfX3s z(0a_kJ=uN1`&&s!#jA)Qjtq|5m&37=i^d^gsbA|Zu6>p0Wu1$+Bqk&ph5D*%cXJe6 zZ?T@~irSyt06Y4w(p`b){YawON(++BLA+JU-*RM{?p&dEe*!qXl)R*-`HGdvGSHPH|GT)?JM7ljv}7P?$hmMu1APGm=j4?N^tOb#5;HNzwLF{K&_WQRAok48spq=LFV& z0}pJH)#><;t}@(u!20(KyFi0^>^{K7;4 zP!&49kZ}aO8g!}wgw} z0(6-F6%YtD?=N?E%3PrE$*K`BUhY^tSzGP@CepzFwk{5IIb zi|9t3Rt^{t>xQM+MuG+z4q5Dgk#!^x-OLDmm66~!42n4!~zf;g+Y5`0T(YE9-zd`kHK-nK`~B= zX{s3Tk7%fbp{)G`;t%P@SkGT-6imrDHyjBHJ}=@wq)@piQhv9E$_{^K`~T47e@Ojp z+kZ&ZU;6y-3HtM5fL`SCLO#&4|F3%j>I?rT-uUAUr0z@B zG$vEt#-M?rJ^=rFGeCX&|KwIdldu{DJQ;dtshI1lTb|S;ke$))VtP0yXNX9t`n705 zZ$}vz^rnMQ%pW>_w9~*5Nn}-LG!aq)>Y;y@rDOom9OpoGCIMCwFc=^SmjMt+82}}X z0Fcl#3Nl$z68e>bLti8yA%Oy<;4nZMfdrrx9gsxAe)35}Z6yF16r9SBl#!y!q0lH_ zgajm;A_tNuAqlmE$`l$!=I=TwhzBJJpdj-lDf|!~AtfPAZ9}yK(j|?8zLJnFPZJe_3JKXE4M;)tQk3?Rlsc%4kN~70U!)+tC^(AxrL>VkP~o6kbXnAt7B9UJ9QL0IK65lo*G10%U&d+9;ct zAA1VQF#N~JQpe{%;@}?xNFC7A&^!IV8E+L8f-hht1_#6t&_GE;#!JgcPz-@EYbO$b zgf?0v473PQ$)GCQLkmj{777wpcCli1xdlW6QGsVk^JY0Na+L6 z8FcpZaK_?69OweNf^MKYhzIdFJQgH?gumnyC=@)_8SUl>dVn5iA{I{-M=kvj<$)zq zdJ=?waRdj@6ZG^VP~4%YA%Y~3g!P8>kZ`^r3EDhjK{7~oruaklA%I?>*I&xKK_Ac; z^aK4te=Lyz?R+Ug0VN`^(6o}GPUycjvsB8@5c;=d#Xq7?_4lS0DgB?DTHlNoi@g?y z&1=eQissoWkj;uhV7duFW*GUGN*f@Ud&-zSv^1)y$2REYkM& zNQs^3I*cLomaQIsEVxu4!1zeFZJf~f68~uiMB#pp+zVOV2rtFESVitmwaNGWS}%0k zUfIt8PZ46vcSG&D_AK-Wvqsb*Z5%#SnVxjmC4Me`Ok6oA=<&s=1>w}Tdon9*Go>EO z9^vQWtNEUS;y$=;oxoH&KfjJvTH3>GZ0j>+mMNzK)n8Q6m2A#Ti_dwr<~>a>++@AS z^jL3y&HI974;GFe>T3tvbs`pU28PcEcd`7%`7Cu-)*QsFb(qJ3GxZ>=x55M#<;cox^am zYdQieJdfc8?5O9dID0mDN+(Ki=ks8VaqRrz?1e-xaoI=cOUG#$HQq*KhS(I1x}Wb;h%GIscDW%tAe_sIxTB%%bT)3)U>sI4 
zW9i7Sl#~OX4ie48-VjY2Gi#!)Mln_#bbUMg`h;wdMfJVsbh8}GNzc<7Pl#ON@P9W7 zu)3Vo*IwJq)2a9WAlPZw=J7!=vq{q9PFGTU_vEbBN(s&Q-se}P^ib!kIqgq~039U} z40kg%>M&2tPrZ+_dvnxweQoS9rVr02?{fM1iC(?0HyF0GtMkFb--?fUoruQWaJXc& zYf;kq*kHn?`zWuq3t#w8Ilr+RdSEuYU|pJUpN}c~^?)31Xw2cOCoooMZBz5PV{w;p zO_;WeWq15Nq{dPfY?sqn=6bXkJ)6+;r!p&}^s}aK3``m$Lc~%~pi-o(dgtrXR z+-0g`l>8*U5c5z;v9e)8-+dDPg1^C8@eJmKaQ}7D)riPd9Rw%ymsD{sD^Oh6EA23A z=KG+dv3WFC4+vPL(aJGhsr;}~aXjNd$C{?3L1BJP0)M9JgIVEXl152&Q{?W)qIVY5 zGZ`Q08syKP_YfSrYHBXT!^@67Wv-AXvmBJWo2&A{evHFwOyPUm!e~wF=1Ft4Mjq)* z2FY|1o%xrKO-Hj*&(y+!-Ka6uE_S2<69FpMgmv9`0^~b-k;^mYTrk? zP%omTxpDJIL6tq%Xy%f>RqVNwm`5)d-9BU~c;_4Co<#>oZZoqB_sD&9JSBWgaZmml z;+6LYm#-pF<3z`nD59G!E9IwQ=EK12EmOw&Hn%p4Je={DObQP<=HPm6bP?`xH7 z*qie1V4Mk$Y+9tDZ$pOo@mOw*-gh}Ncr7W2IQ;M@7{GYhe7;x8g?xv!n!d?0p?_&MS< zdo-_3kdK+Q&9IU03EOMs0S3u&*I0O@+%?s!>X7wY$!h{-2|2BIstYY+7Oq-rPOfkY z?0stVF1NllQFE{|GX?vIFy}TH)p=$mxH322m@NY^(-7ezp!CC_ubEu+F^4q<*J3YkOWof28Rt?d~W`)4HuK~-lZwpOU3aYAh(R)j3zjvvU9&KPMh=}Hr@qe)u znQlK_72#Q6e5{SbcL~43?HA!5YMFz}kyqUHVomRzrhU^(?v51FW-5%wNpGU?p-kwM zV)oYY=&qE&yxW^Jo5{gr&RV)>Rtb!DP6D)MuKQoA6g1tCYQDmktt*Qgaxz&VTxP$i zE|6)RXKNu_Av$HyXOxswb;9qPQ~JJjp0Mqq<3;rb`OIG3=5q>t7v=U=zds;7%DVE5 z%jQ__mT-Wf*kw)AJN6%}Vz@5d8#268qoQZ>jb5K4m_xys4^vUd=cn`LnIP|tkh7n6 zUDH=Y$9lpz7|;kY2Llfzea#~`VMi8e(Zm)}@o~QI*Ju(-0+mswz0m?+J2q)}WLdXg zRp)6u60kw#ao>A3$Me0%dS$=fO0d(j@#7(Fc}w5!XS%}Z)2t^4&J|=2o@CJDXS_?l zr}^A&p?SLdmpYprn$J&1%P2Fl6==uE7;S}hD4bd8IM?Iwtx8JA=i$XibGJ1rP+QMKKto9v*McPn1cSn^doF4>VW#v)GIyVh^>GF0 zX)ud=^ToN(x7#J^OoW|~VuO7p#ihnt>FiBC8ExkBPWsDV)@|Q*RQQ{->^3wEse75W z?%^<6t+2S$xLI!n(`~JF^o#UJ%%(*BA+l*itK}_~J$;Gw{ox^ZH4iqI$`mm+A2$%G zT-ARw=pb7AmVa*790tY2NQ>f8Y2h76@NcW$9rB89cZ3Oja`V$i0Y2XwX)oqs8n0g| zM&-W-FDH<1nO#A9iS00F-``gCrEP$td5&hYu74SMRx*XD8_CUfwZ(s@s|h3i0B>xR z(x`WnE3I=8S9F}FhTRA2rH7nfPv(mjFx-ltbc>sGTZ}qRAK2&2|MGV2=H?NBkB#5* zEXvw9gaaq+aWTliu1{0CspU7s1d5&bhO@b8`8wR;j|}94XU98;RzUo&{cj%984T|f zdMP;{%d^|u)%(yJA*H;*F|Ayk&d#_u>@F{JMtKL{R0!>|eW4<4pn92*JI#}v9JcSh 
zbCTuYg;U`J_i|1g?TIM=ct$-MTuqAX-GvM&jBtw0C_T(?G7vD}@XAMJ;Pl0YI`W5a zpKGRG3OubHgC3&_;hrlD32-Zl^8c^yFuId^Bw1QTld~KrKML^;F}Ec z8SnQ7x)8pyRz`#*-wAe~uAC^Q>nP)Kf|bPx2cbgO)z4SA=lDcFSsx(X{Y;?A2(2}F zR4e!u&3%;rkh86xuIGNSpmC1(`-O}SL~baiErzl;}9@{_2`y zeSCh~m8W+Ulz+UGe0NP*kDqy1_i56!2`|2-IMOLrzF{CEYe4jaJhBM=3GZY5*t-!k zMJIaihJJ8m=z-1ayV8wl89od>qpi(OT&syuLI;f2+@Tu%R@y6PhvTihvSVx8G7ZY-L%VF-)8bZ*?5J$)UbMM<~EE6oU> zT8_B1-Le#zduVpav-iv1PH({*bB>Cv%=@{+={~Z4=8s|vH}5Iqu$y8)KiF~d14r_g zMTc%B;qePIPey2ri${hxo&^tok1_0O+UquRo|g1Qg8qKTFE^k3TA*63tLYxJKCQg@vxoiDU_~GNpNBu5qO#q}bH+G zx_(C4cZz+J{mHFj=RRz7J+(?z_JnKF5E&U5CE{++D_bUf$)+*v8`b zp1htscdnX?hPQS<2--^D;1qpbF`62XG>=$#Jjm&5bHBW04LP098L#xNLvp@3nTnSZgy*z=~|M0+bK{Lzaj+_fi$~K|5Z{Jo) zy|TX_;T2xDc@h2kOpBlJnHyp5G)dff0^0{~6p-F9?j1EM zI@H?lyjh9N$Zu!K4Niy&ojrPwwExj}^Q&^O&um{WovIsedXFoV6LpZrn~r+61={j~ z9&}H*%~YaB(Z=)EF)><0pH4`dV&3o=G5cAT0mGU}yH8c!YF%t@;?sJ3rTf-e(r1haaWVE6Ts>>s*mo&EHpguZJ%?IApN5<$aV%!`&4AlUs58^@m6T;Wb>~Jvfx%0wLaz@9FieoRm(LXg72Q+ zlzDgTtufn&!8CeO=;!XHspL~pSy?IELGF=j%5n`)LyfUdo`eV$M666_rg@WEV-ZKU zw>+7cd2JcC?9$^l9Hfm3chND1QdnCEnQ2f!JXJ|#EGn*0w0j7NIIuU)lxuN9k zci~s8AKzkgJ?x}ibBN|Wr*xg&C-j%=)lP?w>0O_al^-x|R(Pny8|FJViOZpH zXJ)CyR8Gv)Urh{sS=h?uod2HqID|emvDstu*_rTjeA{u}XARLK_arY2dCna7cr=h_6;8JD^e!wOEaPImXKHy*1Sb1iys>xgFw=P@PuqR2W{Zw3<%@3-QOLxV(? 
zcyn9kM4X84riY6_qx743-h12MCB$FFBDW+SW(Jhl>+j^Qh&y5D)M6F-ivMoh+4JU& zLn~Gb0M}85O#PREN+mw;D(Jr&FfOY`ZV9P;v@5}Rnq;xZ^qqdDqos9YuvgGt(>&l^ z+MegmIgbin_VpB|m(3^zTP?|KM(a;~tymytb$vBwi#$m?97U?&7_J*~wcZRIjSY;b zdvHAT@;P%d@?=lq_%=}$9)JJZ{lj7wvR{fy469Oid(nkFpvCYkAH$dN7M9pm?8=;+ zQU=j2y-sXL#d(-crQ%i>@+h)C|*``az=G$J|!nkwd(oxn*!5yJhJY?AM zCOfUP+K`x=GK*Hl=FZ*VXuwWd?h4a$%79(}gN-*L%Ax`Cj}lMcH?MrQGx$Pd!P+67 zNZXN5{lKx+z)X2Be%hYctSIo9;)CUt<;3OG*evU|oIr*gXNloE9Ji!M$XtuO+S0qP zYKLF>Z?i1~Fs=9;VvP#_SKyus>HY^``y1^ADbW0%CvzNL8BfCfA*kUT9kCE@h`~Y- zI|RK`Q9uZd13(-AdIKPI$rwU;9RLsmfKVbDLK^`p{s&MHMJh%Msz77_gkXL@0K@~J zI{?~4asbc`0KEXvnaT-akI*k9iwcqgpd$c!LNX~BDkPT*lR}i=&?Epl0iZ7gF+y^E zAmKltVJd(MLA=oOA6yc8_}SbTN>Tuz4*ksQ0N5Go&VOi1K`lVUmqypghD}SDX0RUf-!)LS>30nZfrKvY{L%m=^$@@8K_O(IPXARy zEx=Gxl7DeSDF1Ick`l^F@}L_=u@A~O^%Q1tp^p>ae((?t!i5>`DzuQr-N+v9`8hL73kU_^j_Awe-{ rL?BRx|3^@P+IU9-lsEeoAIuCSGMY&KF`94$0s+}8AfRrb!OZYqVWQwG literal 0 HcmV?d00001 diff --git a/tests/test_common.py b/tests/test_common.py index 5f8c81c..34ae94f 100644 --- a/tests/test_common.py +++ b/tests/test_common.py @@ -102,6 +102,15 @@ def test_stream_flag_size(): assert df.equals(tables[0].df) +def test_stream_layout_kwargs(): + df = pd.DataFrame(data_stream_layout_kwargs) + + filename = os.path.join(testdir, "detect_vertical_false.pdf") + tables = camelot.read_pdf( + filename, flavor="stream", layout_kwargs={"detect_vertical": False}) + assert df.equals(tables[0].df) + + def test_lattice(): df = pd.DataFrame(data_lattice) From e0cb93513085b9493daafd97b50af07ebfd282dd Mon Sep 17 00:00:00 2001 From: Vinayak Mehta Date: Wed, 19 Dec 2018 18:45:17 +0530 Subject: [PATCH 6/6] Fix docs --- docs/user/advanced.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/advanced.rst b/docs/user/advanced.rst index 4be85ca..a36ea2c 100644 --- a/docs/user/advanced.rst +++ b/docs/user/advanced.rst @@ -534,4 +534,4 @@ To deal with such cases, you can tweak PDFMiner's `LAParams kwargs >> tables = camelot.read_pdf('foo.pdf', 
layout_kwargs={'detect_vertical': True}) + >>> tables = camelot.read_pdf('foo.pdf', layout_kwargs={'detect_vertical': False})